In [0]:
# Read the retail CSV with a header row and let Spark infer column types.
df = (
    spark.read.format("csv")
    .options(header="true", inferSchema="true")
    .load("/FileStore/tables/2010_12_01-1.csv")
)
df.printSchema()
# Expose the DataFrame to Spark SQL under the name "dfTable".
df.createOrReplaceTempView("dfTable")


In [0]:
# Peek at the first five rows.
df.head(n=5)

`lit` function - converts a value from another language (here, Python) into its corresponding Spark representation:

In [0]:
from pyspark.sql.functions import lit
# Wrap an int, a string and a float as Spark literal columns.
df.select(
    lit(5),
    lit("five"),
    lit(5.0),
)


Specify equality (the cell below uses the inverse, `!=`, to keep rows whose InvoiceNo is not 536365):

In [0]:
from pyspark.sql.functions import col
# Keep rows whose InvoiceNo differs from 536365 and show two columns untruncated.
(
    df.filter(col("InvoiceNo") != 536365)
    .select("InvoiceNo", "Description")
    .show(5, truncate=False)
)


Chain together filters as a sequential filter:

In [0]:
from pyspark.sql.functions import instr
# Two Boolean column expressions that are OR-ed together below.
priceFilter = col("UnitPrice") > 600
descripFilter = instr(df.Description, "POSTAGE") >= 1
# First restrict to StockCode "DOT", then apply the combined predicate.
(
    df.filter(df.StockCode.isin("DOT"))
    .filter(priceFilter | descripFilter)
    .show()
)


Filter a DataFrame by specifying a Boolean column:

In [0]:
from pyspark.sql.functions import instr
# Individual predicates, each a Boolean column expression.
DOTCodeFilter = col("StockCode") == "DOT"
priceFilter = col("UnitPrice") > 600
descripFilter = instr(col("Description"), "POSTAGE") >= 1
# Materialise the combined predicate as a column, then filter on that column by name.
(
    df.withColumn("isExpensive", DOTCodeFilter & (priceFilter | descripFilter))
    .filter("isExpensive")
    .select("unitPrice", "isExpensive")
    .show(5)
)


Expressed as a where clause a la SQL. Easier to express filters as SQL statements than using the programmatic DataFrame interface and Spark SQL allows us to do this without paying any performance penalty:

In [0]:
from pyspark.sql.functions import expr
# Same idea with the predicate written as a SQL expression string.
(
    df.withColumn("isExpensive", expr("NOT UnitPrice <= 250"))
    .filter("isExpensive")
    .select("Description", "UnitPrice")
    .show(5)
)


If you’re working with null data when creating Boolean expressions, you must perform a null-safe equivalence test:

In [0]:
# Null-safe comparison of Description against the literal "hello".
(
    df.filter(col("Description").eqNullSafe("hello"))
    .show()
)

fabricate a contrived example 
* let’s imagine that we found out that we mis-recorded the quantity in our retail dataset 
* true quantity is equal to (the current quantity * the unit price)^2 + 5.

In [0]:
from pyspark.sql.functions import expr, pow
# (Quantity * UnitPrice) squared, plus 5, as a column expression.
fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2) + 5
(
    df.select(
        expr("CustomerId"),
        fabricatedQuantity.alias("realQuantity"),
    )
    .show(2)
)


As a SQL expression:

In [0]:
# The same computation written entirely as SQL expression strings.
(
    df.selectExpr(
        "CustomerId",
        "(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity",
    )
    .show(2)
)


Rounding - it is easiest to cast the value to an integer, but Spark has more detailed functions for performing this explicitly and to a certain level of precision. In the following example, we round to one decimal place. By default, the round function rounds up if you’re exactly in between two numbers. You can round down by using the **bround** function:

In [0]:
# in Python
from pyspark.sql.functions import lit,round,bround
# Compare round and bround on the same literal "2.5".
(
    df.select(
        round(lit("2.5")),
        bround(lit("2.5")),
    )
    .show(2)
)

Compute the correlation of two columns. See the Pearson correlation coefficient for two columns (to see if cheaper things are typically bought in greater quantities). Can do this through a function as well as through the DataFrame statistic methods:

In [0]:
from pyspark.sql.functions import corr
# DataFrame statistic method. This is not the last expression in the cell, so
# the notebook would not display its value; print it explicitly.
print(df.stat.corr("Quantity", "UnitPrice"))
# The same correlation computed through the corr column function.
df.select(corr("Quantity", "UnitPrice")).show()


Summary Stats:

In [0]:
# Summary statistics for the DataFrame's columns.
(
    df.describe()
    .show()
)


individual functions:

In [0]:
from pyspark.sql.functions import count, mean, stddev_pop, min, max


Calculate either exact or approximate quantiles of your data using the approxQuantile method:

In [0]:
# Column to analyse, the quantile probabilities to compute (0.5 = median),
# and the relative error passed to approxQuantile.
colName = "UnitPrice"
quantileProbs = [0.5]
relError = 0.05
# Use colName rather than repeating the literal so the three settings above
# are the single place to change. Original author's note: # 2.51
df.stat.approxQuantile(colName, quantileProbs, relError)


see a cross-tabulation:

In [0]:
# Cross-tabulate StockCode against Quantity.
(
    df.stat.crosstab("StockCode", "Quantity")
    .show()
)


frequent item pairs:

In [0]:
# Frequent items for the StockCode and Quantity columns.
(
    df.stat.freqItems(["StockCode", "Quantity"])
    .show()
)


Add a unique ID to each row by using the function monotonically_increasing_id. This function generates a unique value for each row, starting with 0:

In [0]:
from pyspark.sql.functions import monotonically_increasing_id
# Generate an ID column and show its first two values.
(
    df.select(monotonically_increasing_id())
    .show(2)
)


initcap - function will capitalize every word in a given string when that word is separated from another by a space:

In [0]:
from pyspark.sql.functions import initcap
# Apply initcap to the Description column.
(
    df.select(initcap(col("Description")))
    .show()
)


Casting Cases:

In [0]:
from pyspark.sql.functions import lower, upper
# Original text, lower-cased text, and the lower-cased text upper-cased again.
(
    df.select(
        col("Description"),
        lower(col("Description")),
        upper(lower(col("Description"))),
    )
    .show(2)
)


Adding or removing spaces around a string - you can do this by using lpad, ltrim, rpad and rtrim, trim:

In [0]:
from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
# Trimming and padding applied to literal strings, one aliased column per function.
(
    df.select(
        ltrim(lit("    HELLO    ")).alias("ltrim"),
        rtrim(lit("    HELLO    ")).alias("rtrim"),
        trim(lit("    HELLO    ")).alias("trim"),
        lpad(lit("HELLO"), 3, " ").alias("lp"),
        rpad(lit("HELLO"), 10, " ").alias("rp"),
    )
    .show(2)
)


RegEx - replace substitute color names in our description column

In [0]:
from pyspark.sql.functions import regexp_replace
# Alternation pattern matching any of the five colour names.
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
# Replace each match with the word COLOR and show it next to the original text.
(
    df.select(
        regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"),
        col("Description"),
    )
    .show(7)
)


Spark provides the translate function to replace these values. This is done at the character level and will replace all instances of a character with the indexed character in the replacement string (leetspeak example, lol):

In [0]:
from pyspark.sql.functions import translate
# Character-level substitution using "LEET" as the matching set and "1337" as the replacements.
(
    df.select(
        translate(col("Description"), "LEET", "1337"),
        col("Description"),
    )
    .show(2)
)


extracting first mentioned color:

In [0]:
from pyspark.sql.functions import regexp_extract
# Same colour alternation, wrapped in a capture group so group 1 can be extracted.
extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
(
    df.select(
        regexp_extract(col("Description"), extract_str, 1).alias("color_clean"),
        col("Description"),
    )
    .show(2)
)


Check for existence. We can do this with the contains method on each column or, as in the cell below, with the instr function compared against 1. This will return a Boolean declaring whether the value you specify is in the column’s string:

In [0]:
from pyspark.sql.functions import instr
# Boolean column expressions built from instr(...) >= 1 for each colour.
containsBlack = instr(col("Description"), "BLACK") >= 1
containsWhite = instr(col("Description"), "WHITE") >= 1
# Keep rows where either expression holds and show the description untruncated.
(
    df.withColumn("hasSimpleColor", containsBlack | containsWhite)
    .filter("hasSimpleColor")
    .select("Description")
    .show(3, truncate=False)
)


locate - returns the integer location (1 based location),  then converts that to a Boolean before using it as the same basic feature:

In [0]:
from pyspark.sql.functions import expr, locate
simpleColors = ["black", "white", "red", "green", "blue"]


def color_locator(column, color_string):
  """Build a Boolean column named "is_<color_string>".

  Calls locate with the upper-cased colour name on ``column``, casts the
  result to boolean, and aliases it.
  """
  located = locate(color_string.upper(), column)
  return located.cast("boolean").alias("is_" + color_string)


# One Boolean column per colour, followed by every original column.
# expr("*") is used because the list entries have to be Column objects.
selectedColumns = [color_locator(df.Description, c) for c in simpleColors] + [expr("*")]

(
    df.select(*selectedColumns)
    .filter(expr("is_white OR is_red"))
    .select("Description")
    .show(3, truncate=False)
)


get the current date and the current timestamps:

In [0]:
from pyspark.sql.functions import current_date, current_timestamp
# Ten-row DataFrame with the current date and current timestamp attached.
dateDF = (
    spark.range(10)
    .withColumn("today", current_date())
    .withColumn("now", current_timestamp())
)
# Expose it to Spark SQL as "dateTable".
dateDF.createOrReplaceTempView("dateTable")


Add and subtract five days from today. These functions take a column and then the number of days to either add or subtract as the arguments:

In [0]:
from pyspark.sql.functions import date_add, date_sub
# Five days before and five days after "today".
(
    dateDF.select(
        date_sub(col("today"), 5),
        date_add(col("today"), 5),
    )
    .show(1)
)


* datediff function - will return the number of days in between two dates
* months_between - gives you the number of months between two dates

In [0]:
from pyspark.sql.functions import datediff, months_between, to_date
# Day difference: week_ago is passed as the first argument, today as the second.
(
    dateDF.withColumn("week_ago", date_sub(col("today"), 7))
    .select(datediff(col("week_ago"), col("today")))
    .show(1)
)

# Month difference between two literal dates parsed with to_date.
(
    dateDF.select(
        to_date(lit("2016-01-01")).alias("start"),
        to_date(lit("2017-05-22")).alias("end"),
    )
    .select(months_between(col("start"), col("end")))
    .show(1)
)


In [0]:
from pyspark.sql.functions import to_date, lit
# Attach a date string as a literal column, then convert it with to_date.
(
    spark.range(5)
    .withColumn("date", lit("2017-01-01"))
    .select(to_date(col("date")))
    .show(1)
)


Spark will not throw an error if it cannot parse the date; rather, it will just return null. This can be a bit tricky in larger pipelines because you might be expecting your data in one format and getting it in another. To illustrate, let’s take a look at the date format that has switched from year-month-day to year-day-month. Spark will fail to parse this date and silently return null instead:

In [0]:
# Two literal strings converted with to_date and no explicit format.
(
    dateDF.select(
        to_date(lit("2016-20-12")),
        to_date(lit("2017-12-11")),
    )
    .show(1)
)

In [0]:
from pyspark.sql.functions import to_date
# Explicit year-day-month pattern handed to to_date for both literals.
dateFormat = "yyyy-dd-MM"
cleanDateDF = (
    spark.range(1)
    .select(
        to_date(lit("2017-12-11"), dateFormat).alias("date"),
        to_date(lit("2017-20-12"), dateFormat).alias("date2"),
    )
)
# Expose the parsed dates to Spark SQL as "dateTable2".
cleanDateDF.createOrReplaceTempView("dateTable2")


to_timestamp - which always requires a format to be specified

In [0]:
from pyspark.sql.functions import to_timestamp
# Convert the "date" column to a timestamp, passing the same dateFormat pattern.
(
    cleanDateDF.select(to_timestamp(col("date"), dateFormat))
    .show()
)


Select the first non-null value from a set of columns by using the coalesce function. In this case, there are no null values, so it simply returns the first column:

In [0]:
from pyspark.sql.functions import coalesce
# coalesce over Description and CustomerId, in that order.
(
    df.select(coalesce(col("Description"), col("CustomerId")))
    .show()
)


In [0]:
# Drop with how="all", restricted to the StockCode and InvoiceNo columns.
df.na.drop(how="all", subset=["StockCode", "InvoiceNo"])


fill function - you can fill one or more columns with a set of values. This can be done by specifying a map—that is a particular value and a set of columns

In [0]:
# Fill with the string value "all", restricted to the StockCode and InvoiceNo columns.
df.na.fill(value="all", subset=["StockCode", "InvoiceNo"])


In [0]:
# Per-column fill values: column name -> replacement value.
fill_cols_vals = dict(StockCode=5, Description="No Value")
df.na.fill(fill_cols_vals)


Replace - replace all values in a certain column according to their current value. The only requirement is that this value be the same type as the original value.

In [0]:
# In the Description column, replace the empty string with "UNKNOWN".
df.na.replace(to_replace=[""], value=["UNKNOWN"], subset="Description")


In [0]:
from pyspark.sql.functions import struct
# Pack Description and InvoiceNo into a single struct column named "complex".
complexDF = df.select(
    struct("Description", "InvoiceNo").alias("complex"),
)
# Expose it to Spark SQL as "complexDF".
complexDF.createOrReplaceTempView("complexDF")


Split function and specify the delimiter:

In [0]:
from pyspark.sql.functions import split
# Split Description on a single space and show it beside the original text.
(
    df.select(
        split(col("Description"), " "),
        col("Description"),
    )
    .show(2)
)


Manipulate this complex type as another column:

In [0]:
# Name the split result "array_col", then index its first element with SQL syntax.
(
    df.select(split(col("Description"), " ").alias("array_col"))
    .selectExpr("array_col[0]")
    .show(2)
)


In [0]:
from pyspark.sql.functions import size
# Number of elements produced by the split (original author's note: shows 5 and 3).
(
    df.select(size(split(col("Description"), " ")))
    .show(2)
)


In [0]:
from pyspark.sql.functions import array_contains
# Does the split Description array contain the element "WHITE"?
(
    df.select(array_contains(split(col("Description"), " "), "WHITE"))
    .show(2)
)


In [0]:
from pyspark.sql.functions import split, explode

# Split Description into an array column, then explode that array column.
(
    df.withColumn("splitted", split(col("Description"), " "))
    .withColumn("exploded", explode(col("splitted")))
    .select("Description", "InvoiceNo", "exploded")
    .show(4)
)


Maps are created by using the map function (exposed in PySpark as `create_map`) and key-value pairs of columns. You then can select them just like you might select from an array:

In [0]:
from pyspark.sql.functions import create_map
# Build a map column keyed by Description with InvoiceNo as the value.
(
    df.select(
        create_map(col("Description"), col("InvoiceNo")).alias("complex_map"),
    )
    .show(2)
)


In [0]:
# Python's builtin map() is not Spark's map constructor: it returns a lazy
# iterator that has no .alias method. The Spark function is create_map.
from pyspark.sql.functions import create_map

# Build the map column, then look up one key with SQL bracket syntax.
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
  .selectExpr("complex_map['WHITE METAL LANTERN']").show(2)


In [0]:
# Python's builtin map() is not Spark's map constructor: it returns a lazy
# iterator that has no .alias method. The Spark function is create_map.
from pyspark.sql.functions import create_map

# Build the map column, then explode it with a SQL expression.
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
  .selectExpr("explode(complex_map)").show(2)


In [0]:
# One-row DataFrame whose single column, jsonString, is a SQL string literal
# containing a JSON document.
jsonDF = spark.range(1).selectExpr("""
  '{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString""")


In [0]:
from pyspark.sql.functions import get_json_object, json_tuple

# `expr as "name"` is Scala syntax and a SyntaxError in Python; name the
# column with .alias() instead.
jsonDF.select(
    # Path query into the JSON string, aliased as "column".
    get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]").alias("column"),
    # Extract the "myJSONKey" field with json_tuple.
    json_tuple(col("jsonString"), "myJSONKey")).show(2)


In [0]:
from pyspark.sql.functions import to_json
# Build a struct from two columns, then serialise that struct with to_json.
(
    df.selectExpr("(InvoiceNo, Description) as myStruct")
    .select(to_json(col("myStruct")))
)


In [0]:
from pyspark.sql.functions import from_json
from pyspark.sql.types import *
# Schema handed to from_json: two nullable string fields.
parseSchema = StructType((
    StructField("InvoiceNo", StringType(), nullable=True),
    StructField("Description", StringType(), nullable=True),
))
# Struct -> JSON string -> parsed again with parseSchema, shown beside the JSON text.
(
    df.selectExpr("(InvoiceNo, Description) as myStruct")
    .select(to_json(col("myStruct")).alias("newJSON"))
    .select(from_json(col("newJSON"), parseSchema), col("newJSON"))
    .show(2)
)


In [0]:
# Five-row DataFrame from spark.range(5) with its column named "num".
udfExampleDF = (
    spark.range(5)
    .toDF("num")
)
def power3(double_value):
  """Return ``double_value`` raised to the third power."""
  cubed = double_value ** 3
  return cubed
# Call the plain Python function directly.
power3(double_value=2.0)


In [0]:
from pyspark.sql.functions import udf
# Wrap the Python function as a Spark UDF usable in DataFrame expressions.
power3udf = udf(f=power3)


In [0]:
from pyspark.sql.functions import col
# Apply the UDF to the "num" column.
(
    udfExampleDF.select(power3udf(col("num")))
    .show(2)
)


register a function to make it available as a DataFrame function:

In [0]:
from pyspark.sql.types import IntegerType, DoubleType
# Register power3 under the SQL name "power3py" with DoubleType as its declared return type.
spark.udf.register(name="power3py", f=power3, returnType=DoubleType())


If you specify the type that doesn’t align with the actual type returned by the function, Spark will not throw an error but will just return null to designate a failure:

In [0]:
# Call the function registered via Python from a SQL expression string.
(
    udfExampleDF.selectExpr("power3py(num)")
    .show(2)
)
