Creating DataFrame from RDD with predefined schema

In [None]:
from pyspark.sql import Row

# Source data: a small RDD of (name, year) tuples
beatles = [('John',1940),('Paul',1942),('George',1943),('Ringo',1940)]
r = sc.parallelize(beatles)


def _as_row(pair):
    """Turn a (name, year) tuple into a Row with named fields."""
    name, year = pair
    return Row(name=name, year=year)


# RDD of tuples -> RDD of Row objects -> DataFrame
r_row = r.map(_as_row)
r_df = spark.createDataFrame(r_row)

# Expose the DataFrame to SQL under the view name "df"
r_df.createOrReplaceTempView("df")

# "spark" is the predefined SparkSession; the query derives an age from the year column
age_query = "SELECT df.name, df.year, year(current_date()) - df.year as age FROM df ORDER by df.year"
ages = spark.sql(age_query)

# Bring the result rows back to the driver and show them as the cell output
ages.collect()


In [None]:
# import pyspark class Row from module sql
from pyspark.sql import *

# Example data: departments, employees, and departments bundled with their staff

# Departments: Rows built with named fields
department1, department2, department3, department4 = (
    Row(id=dept_id, name=dept_name)
    for dept_id, dept_name in [
        ('123456', 'Computer Science'),
        ('789012', 'Mechanical Engineering'),
        ('345678', 'Theater and Drama'),
        ('901234', 'Indoor Recreation'),
    ]
)

# Employees: "Employee" is a Row template holding only field names;
# calling it with positional values produces a concrete Row
Employee = Row("firstName", "lastName", "email", "salary")
employee1, employee2, employee3, employee4, employee5 = (
    Employee(*record)
    for record in [
        ('michael', 'armbrust', 'no-reply@berkeley.edu', 100000),
        ('xiangrui', 'meng', 'no-reply@stanford.edu', 120000),
        ('matei', None, 'no-reply@waterloo.edu', 140000),
        (None, 'wendell', 'no-reply@berkeley.edu', 160000),
        ('michael', 'jackson', 'no-reply@neverla.nd', 80000),
    ]
)

# Nested Rows: each one pairs a department with a list of employee Rows
(departmentWithEmployees1,
 departmentWithEmployees2,
 departmentWithEmployees3,
 departmentWithEmployees4) = (
    Row(department=dept, employees=staff)
    for dept, staff in [
        (department1, [employee1, employee2]),
        (department2, [employee3, employee4]),
        (department3, [employee5, employee4]),
        (department4, [employee2, employee3]),
    ]
)

# A few samples, including attribute access into the nested structure
print(department1)
print(employee2)
print(departmentWithEmployees1.employees[0].email)

In [None]:
from pyspark.sql.types import *

# any RDD
r = sc.parallelize([('John',1940),('Paul',1942),('George',1943),('Ringo',1940)])

# schema description: comma-separated "<field name> <type name>" pairs
schema_str = 'Name string,Year long'

# Type names understood in the schema string, mapped to Spark SQL type classes.
# Extend this mapping to support more types; unknown names fall back to StringType.
type_by_name = {'long': LongType, 'string': StringType}

# build a list of fields from the schema description string
fields = []
for field in schema_str.split(','):
    # split() without an argument tolerates extra whitespace around the two tokens
    field_name, type_name = field.split()
    # named field_type (not "type") so the builtin type() is not shadowed
    # for the rest of the notebook session
    field_type = type_by_name.get(type_name, StringType)()
    # all fields are nullable for simplicity
    fields.append(StructField(field_name, field_type, True))

# define schema as a list of fields
schema = StructType(fields)

# combine data and schema into DataFrame
df = spark.createDataFrame(r, schema)

# just another convenient way to see DataFrame contents
df.show()



In [None]:
# File location and type
file_location = "/FileStore/tables/sample_07.csv"
file_type = "csv"

from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType

# Explicit schema for the file: four nullable columns
customSchema = StructType([
    StructField("code", StringType(), True),
    StructField("description", StringType(), True),
    StructField("total_emp", IntegerType(), True),
    StructField("salary", IntegerType(), True)
])

# CSV options: no schema inference (the schema is given above), no header row, tab-separated
infer_schema = "false"
first_row_is_header = "false"
delimiter = "\t"

# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .schema(customSchema) \
  .load(file_location)


# printSchema is a method: without the call parentheses nothing is printed
df.printSchema()
# plain-Spark alternative to the notebook's display():
#df.show(n = 10, truncate = False)
display(df)

In [None]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

# new column from expression, renaming columns, columnar projection
#
# salary and total_emp are both IntegerType, so their product would be computed
# as a 32-bit integer and can overflow for large values; widening one operand
# to long makes the multiplication 64-bit.
df.withColumn("total_salary", df.salary.cast("long") * df.total_emp)\
  .withColumn("new_column", lit(0))\
  .withColumnRenamed("salary", "avg_salary")\
  .show()

# optional steps that can be chained in before .show():
#.select("code","total_emp","avg_salary","total_salary")\
#.drop("RIP")\

# DataFrame API: filtering

In [None]:
# Filtering with column-expression predicates plus one SQL-style predicate string;
# the where() calls are chained, so every condition must hold for a row to be shown
well_paid = df.salary > 50000
mid_sized = df.total_emp.between(100000, 300000)
starts_with_a = df.description.like('A%')

(
    df.where(well_paid)
      .where(mid_sized)
      .where('code = "11-1011" or code = "11-3011"')
      .where(starts_with_a)
      .show()
)

# DataFrame API: aggregation

In [None]:
# GroupedData offers shortcut aggregations (avg, count, max, min, sum).
# Convenient, but one call applies a single function to all listed columns.
with_major = df.withColumn("code_major", substring(df.code, 1, 2))
by_major = with_major.groupBy("code_major")
by_major.avg("total_emp", "salary").show(10)

#

In [None]:
from pyspark.sql.functions import pandas_udf, PandasUDFType

@pandas_udf('double', PandasUDFType.SCALAR)
def pandas_plus_one(v):
    """Scalar pandas UDF: receives a pandas Series and returns it incremented by one."""
    return v + 1


# apply the UDF to the "id" column of a ten-row range DataFrame
ids = spark.range(10)
ids.select(pandas_plus_one("id")).show()

In [None]:
from pyspark.sql.functions import *

# register the current DataFrame as a SQL view and re-read it through SQL
df.createOrReplaceTempView("sample_07")
df = spark.sql('select * from sample_07')

# agg() with a dictionary: key = column name, value = aggregation function name.
# Different functions can be mixed, but keys must be plain columns (no expressions).
agg_spec = {"*":"count","salary": "avg", "total_emp": "sum"}

with_major = df.withColumn("code_major", substring(df.code, 1, 2))
with_major.groupBy("code_major").agg(agg_spec).show(10)

In [None]:
from pyspark.sql import functions as sf

df = spark.sql('select * from sample_07')


# unified "agg" method with columnar expressions
# allows to mix different aggregation functions + convenient renaming via .alias
# supports aggregation on expressions
#
# substring is taken from the "sf" alias imported in this cell, so the cell does
# not depend on a wildcard import executed in some other cell
df.withColumn("code_major", sf.substring(df.code, 1, 2))\
  .groupBy("code_major")\
  .agg(sf.round(sf.avg(df.salary*0.13),2).alias("avg_salary_tax")\
      ,sf.round(sf.sum(df.total_emp),2).alias("sum_employees")\
      )\
  .show(10)


# DataFrame API: joins

In [None]:
from pyspark.sql import functions as sf

df = spark.sql('select * from sample_07')
df = df.withColumn("code_major",sf.substring(df.code,1,2))

# create another DataFrame for join: rows whose code ends with "-0000"
level_1 = spark.sql("select code, description from sample_07 where code like '%-0000'")
level_1 = level_1.withColumn("code_major",sf.substring(level_1.code,1,2))

# Both DataFrames are derived from the same "sample_07" view, so references such as
# df.code and level_1.code may resolve to the same underlying column in a self-join.
# Aliasing each side and referring to columns through the alias keeps them unambiguous.
child = df.alias("child")
parent = level_1.alias("parent")

# Third argument is the join type
# allowed join types are: inner, cross, outer, full, full_outer, left, left_outer, right, right_outer, left_semi (aka exists), and left_anti (aka not exists)

child.join(parent, sf.col("parent.code_major") == sf.col("child.code_major"), 'inner')\
  .select(sf.col("child.code").alias("child_code"), sf.col("parent.code").alias("parent_code"))\
  .show()

# Dataframe execution plan

In [None]:
from pyspark.sql import functions as sf

df = spark.sql('select * from sample_07')
df = df.withColumn("code_major",sf.substring(df.code,1,2))

# create another DataFrame for join: rows whose code ends with "-0000"
level_1 = spark.sql("select code, description from sample_07 where code like '%-0000'")
level_1 = level_1.withColumn("code_major",sf.substring(level_1.code,1,2))

# Both DataFrames are derived from the same "sample_07" view; aliasing each side
# keeps the column references unambiguous in this self-join.
child = df.alias("child")
parent = level_1.alias("parent")

test = child.join(parent, sf.col("parent.code_major") == sf.col("child.code_major"), 'inner')\
  .select(sf.col("child.code").alias("child_code"), sf.col("parent.code").alias("parent_code"))

# explain() prints the plans itself and returns None, so it is not wrapped in print()
test.explain(extended = True)

# Writing to storage
 
DBFS root
The default storage location in DBFS is known as the DBFS root. Several types of data are stored in the following DBFS root locations:


/FileStore: Imported data files, generated plots, and uploaded libraries.

/databricks-datasets: Sample public datasets. See Special DBFS root locations.

/databricks-results: Files generated by downloading the full results of a query.

/databricks/init: Global and cluster-named (deprecated) init scripts.

/user/hive/warehouse: Data and metadata for non-external Hive tables.

In [None]:
# optionally start from a CSV file instead of the `df` left over from earlier cells:
#df = spark.read.csv("/FileStore/movies.csv", inferSchema = True, header = True)

# single-file, uncompressed alternative for the write below:
#df.coalesce(1).write.format("com.databricks.spark.csv").option("header", "true").save("dbfs:/FileStore/df/Sample.csv")

# write the DataFrame as a bzip2-compressed CSV.
# mode = "overwrite" replaces the output of a previous run; with the default save
# mode the write fails when the target path already exists, so the cell could
# only be executed once.
df.write.csv("/FileStore/movies.bzip2", mode = "overwrite", compression = 'bzip2', header = True)

# and read it back (a good idea to check, esp. for backups ;) )
df2 = spark.read.csv("/FileStore/movies.bzip2", inferSchema = True, header = True)

df2.printSchema()
df2.count()