In [28]:
from time import sleep

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import functions as F

spark = SparkSession. \
    builder. \
    appName("Joins"). \
    master("local"). \
    config("spark.jars", "data/jars/postgresql-42.2.19.jar"). \
    getOrCreate()


In [2]:
movies_df = spark.read.json("data/movies")
assert(movies_df.count() != 0)

# Filters

In [5]:
# demo_literal_values

meaning_of_life_df = movies_df.select(col("Title"), lit(42).alias("MOL"))
meaning_of_life_df.show(5, False)

+--------------------------+---+
|Title                     |MOL|
+--------------------------+---+
|The Land Girls            |42 |
|First Love, Last Rites    |42 |
|I Married a Strange Person|42 |
|Let's Talk About Sex      |42 |
|Slam                      |42 |
+--------------------------+---+
only showing top 5 rows



In [6]:
# demo_booleans

drama_filter = movies_df.Major_Genre == "Drama" # column object of TYPE boolean
good_rating_filter = movies_df.IMDB_Rating > 7.0
# can use & (and), | (or), ~ (not)
good_drama_filter = good_rating_filter & drama_filter

# can use boolean column objects as arguments to filter
good_dramas_df = movies_df.filter(good_drama_filter).select("Title", "Major_Genre", "IMDB_Rating")
good_dramas_df.show()


+--------------------+-----------+-----------+
|               Title|Major_Genre|IMDB_Rating|
+--------------------+-----------+-----------+
|        12 Angry Men|      Drama|        8.9|
|      Twelve Monkeys|      Drama|        8.1|
|    Twin Falls Idaho|      Drama|        7.1|
|                Amen|      Drama|        7.4|
|        Barry Lyndon|      Drama|        8.1|
|      Before Sunrise|      Drama|        8.0|
|The Best Years of...|      Drama|        8.2|
|      The Big Parade|      Drama|        8.4|
|     Boyz n the Hood|      Drama|        7.8|
|De battre mon coe...|      Drama|        7.3|
|The Birth of a Na...|      Drama|        7.1|
|The Bridge on the...|      Drama|        8.4|
|Born on the Fourt...|      Drama|        7.2|
|The Bridges of Ma...|      Drama|        7.2|
|          Braveheart|      Drama|        8.4|
|    Chariots of Fire|      Drama|        7.3|
|Cat on a Hot Tin ...|      Drama|        8.0|
|    The Color Purple|      Drama|        7.7|
|   Central d

In [7]:
# can add the col object as a column/property for every row
movies_with_good_drama_condition_df = movies_df\
    .select(col("Title"), good_drama_filter.alias("IsItAGoodDrama"))

movies_with_good_drama_condition_df.show(5, False)

+--------------------------+--------------+
|Title                     |IsItAGoodDrama|
+--------------------------+--------------+
|The Land Girls            |false         |
|First Love, Last Rites    |false         |
|I Married a Strange Person|false         |
|Let's Talk About Sex      |false         |
|Slam                      |false         |
+--------------------------+--------------+
only showing top 5 rows



In [8]:
# can filter using the true/false value of a column
good_dramas_df_v2 = movies_with_good_drama_condition_df.filter("IsItAGoodDrama")

good_dramas_df_v2.show(5, False)

+----------------+--------------+
|Title           |IsItAGoodDrama|
+----------------+--------------+
|12 Angry Men    |true          |
|Twelve Monkeys  |true          |
|Twin Falls Idaho|true          |
|Amen            |true          |
|Barry Lyndon    |true          |
+----------------+--------------+
only showing top 5 rows



In [9]:
# negation
bad_drama_filter = ~good_drama_filter
bad_dramas = movies_df.select(col("Title"), bad_drama_filter)
bad_dramas.show(5, False)


+--------------------------+-----------------------------------------------------+
|Title                     |(NOT ((IMDB_Rating > 7.0) AND (Major_Genre = Drama)))|
+--------------------------+-----------------------------------------------------+
|The Land Girls            |true                                                 |
|First Love, Last Rites    |true                                                 |
|I Married a Strange Person|true                                                 |
|Let's Talk About Sex      |true                                                 |
|Slam                      |true                                                 |
+--------------------------+-----------------------------------------------------+
only showing top 5 rows



# Stat num functions

In [11]:
movies_avg_ratings_df = movies_df\
    .select(
    col("Title"),
    (col("Rotten_Tomatoes_Rating") / 10 + col("IMDB_Rating")) / 2
)
# can use ==, >=, >, <, <= to obtain boolean col objects


# Pearson correlation - for numerical fields
# a number [-1, 1]
# is an "action" (the DF must be evaluated)
rating_correlation = movies_df.stat.corr("IMDB_Rating", "Rotten_Tomatoes_Rating")
print(rating_correlation)


0.4259708986248316


# String functions

In [12]:
movies_df.select(initcap(col("Title"))) 
# capitalize initials of every word in the string
# upper(...), lower(...) to uppercase/lowercase

movies_df.filter(col("Title").contains("love")).show(5, False)


+------------------+--------------+--------------------+-----------+----------+-----------+-----------+-----------------+------------+----------------------+----------------+-------------------------+-----------+------------+--------+---------------+
|Creative_Type     |Director      |Distributor         |IMDB_Rating|IMDB_Votes|MPAA_Rating|Major_Genre|Production_Budget|Release_Date|Rotten_Tomatoes_Rating|Running_Time_min|Source                   |Title      |US_DVD_Sales|US_Gross|Worldwide_Gross|
+------------------+--------------+--------------------+-----------+----------+-----------+-----------+-----------------+------------+----------------------+----------------+-------------------------+-----------+------------+--------+---------------+
|Science Fiction   |Matt Reeves   |Paramount Pictures  |7.4        |136068    |PG-13      |Action     |25000000         |18-Jan-08   |76                    |null            |Original Screenplay      |Cloverfield|29180398    |80048433|170764033    

# Regexes filtering

In [14]:
cars_df = spark.read.json("data/cars")

regexString = "volkswagen|vw"
vw_df = cars_df.select(
    col("Name"),
    regexp_extract(col("Name"), regexString, 0).alias("regex_extract")
).filter(col("regex_extract") != "")

vw_df.show(5, False)

+----------------------------+-------------+
|Name                        |regex_extract|
+----------------------------+-------------+
|volkswagen 1131 deluxe sedan|volkswagen   |
|volkswagen super beetle 117 |volkswagen   |
|volkswagen model 111        |volkswagen   |
|volkswagen type 3           |volkswagen   |
|volkswagen 411 (sw)         |volkswagen   |
+----------------------------+-------------+
only showing top 5 rows



In [15]:
vw_new_name_df = vw_df.select(
    col("Name"),
    regexp_replace(col("Name"), regexString, "Volkswagen").alias("replacement")
)
vw_new_name_df.show(5, False)

+----------------------------+----------------------------+
|Name                        |replacement                 |
+----------------------------+----------------------------+
|volkswagen 1131 deluxe sedan|Volkswagen 1131 deluxe sedan|
|volkswagen super beetle 117 |Volkswagen super beetle 117 |
|volkswagen model 111        |Volkswagen model 111        |
|volkswagen type 3           |Volkswagen type 3           |
|volkswagen 411 (sw)         |Volkswagen 411 (sw)         |
+----------------------------+----------------------------+
only showing top 5 rows



# Exercise

    Filter the cars DF, return all cars whose name contains either element of the list
    - contains function
    - regexes

In [19]:
def get_car_names():
    return ["Volkswagen", "Mercedes-Benz", "Ford"]

# v1 - regexes
regexString = "|".join(get_car_names()) # Volkswagen|Mercedes-Benz|Ford
cars_interest_df = cars_df.select(
        col("Name"),
        regexp_extract(lower(col("Name")), regexString, 0).alias("regex_extract")
    ).filter(col("regex_extract") != "").orderBy(col("regex_extract"))

cars_interest_df.show(5, False)

+----+-------------+
|Name|regex_extract|
+----+-------------+
+----+-------------+



In [21]:
# v2 - contains
from functools import reduce

car_name_filters = [col("Name").contains(car_name.lower()) for car_name in ["Volkswagen", "Mercedes-Benz", "Ford"]]
big_filter = reduce(lambda filter1, filter2: filter1 | filter2, car_name_filters)
filtered_cars = cars_df.filter(big_filter)

filtered_cars.show(5, False)

+------------+---------+------------+----------+----------------+---------------------+------+-------------+----------+
|Acceleration|Cylinders|Displacement|Horsepower|Miles_per_Gallon|Name                 |Origin|Weight_in_lbs|Year      |
+------------+---------+------------+----------+----------------+---------------------+------+-------------+----------+
|10.5        |8        |302.0       |140       |17.0            |ford torino          |USA   |3449         |1970-01-01|
|10.0        |8        |429.0       |198       |15.0            |ford galaxie 500     |USA   |4341         |1970-01-01|
|11.0        |8        |351.0       |153       |null            |ford torino (sw)     |USA   |4034         |1970-01-01|
|8.0         |8        |302.0       |140       |null            |ford mustang boss 302|USA   |3353         |1970-01-01|
|16.0        |6        |200.0       |85        |21.0            |ford maverick        |USA   |2587         |1970-01-01|
+------------+---------+------------+---

# Date type

In [29]:
# TODO Not working in Spark 2.4
# How to conver data

movies_with_release_dates_df = movies_df.select(
    col("Title"),
    to_date(col("Release_Date"), "dd-MMM-YY")
)

#     to_date(col("Release_Date"), "dd-MMM-YY").alias("Actual_Release")

movies_with_release_dates_df.show()

Py4JJavaError: An error occurred while calling o284.showString.
: org.apache.spark.SparkUpgradeException: You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'dd-MMM-YY' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failToRecognizePatternAfterUpgradeError(QueryExecutionErrors.scala:936)
	at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkLegacyFormatter$1.applyOrElse(DateTimeFormatterHelper.scala:187)
	at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$$anonfun$checkLegacyFormatter$1.applyOrElse(DateTimeFormatterHelper.scala:180)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.validatePatternString(TimestampFormatter.scala:153)
	at org.apache.spark.sql.catalyst.util.TimestampFormatter$.getFormatter(TimestampFormatter.scala:351)
	at org.apache.spark.sql.catalyst.util.TimestampFormatter$.apply(TimestampFormatter.scala:394)
	at org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.getFormatter(datetimeExpressions.scala:90)
	at org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.getFormatter$(datetimeExpressions.scala:84)
	at org.apache.spark.sql.catalyst.expressions.ToTimestamp.getFormatter(datetimeExpressions.scala:1169)
	at org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.$anonfun$formatterOption$1(datetimeExpressions.scala:81)
	at scala.Option.map(Option.scala:230)
	at org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.formatterOption(datetimeExpressions.scala:81)
	at org.apache.spark.sql.catalyst.expressions.TimestampFormatterHelper.formatterOption$(datetimeExpressions.scala:79)
	at org.apache.spark.sql.catalyst.expressions.ToTimestamp.formatterOption$lzycompute(datetimeExpressions.scala:1169)
	at org.apache.spark.sql.catalyst.expressions.ToTimestamp.formatterOption(datetimeExpressions.scala:1169)
	at org.apache.spark.sql.catalyst.expressions.ToTimestamp.doGenCode(datetimeExpressions.scala:1246)
	at org.apache.spark.sql.catalyst.expressions.Expression.$anonfun$genCode$3(Expression.scala:151)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.catalyst.expressions.Expression.genCode(Expression.scala:146)
	at org.apache.spark.sql.catalyst.expressions.CastBase.doGenCode(Cast.scala:936)
	at org.apache.spark.sql.catalyst.expressions.Expression.$anonfun$genCode$3(Expression.scala:151)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.catalyst.expressions.Expression.genCode(Expression.scala:146)
	at org.apache.spark.sql.catalyst.expressions.CastBase.genCode(Cast.scala:931)
	at org.apache.spark.sql.catalyst.expressions.CastBase.doGenCode(Cast.scala:936)
	at org.apache.spark.sql.catalyst.expressions.Expression.$anonfun$genCode$3(Expression.scala:151)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.catalyst.expressions.Expression.genCode(Expression.scala:146)
	at org.apache.spark.sql.catalyst.expressions.CastBase.genCode(Cast.scala:931)
	at org.apache.spark.sql.catalyst.expressions.Alias.genCode(namedExpressions.scala:171)
	at org.apache.spark.sql.execution.ProjectExec.$anonfun$doConsume$2(basicPhysicalOperators.scala:73)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
	at scala.collection.AbstractTraversable.map(Traversable.scala:108)
	at org.apache.spark.sql.execution.ProjectExec.$anonfun$doConsume$1(basicPhysicalOperators.scala:73)
	at org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext.withSubExprEliminationExprs(CodeGenerator.scala:1039)
	at org.apache.spark.sql.execution.ProjectExec.doConsume(basicPhysicalOperators.scala:73)
	at org.apache.spark.sql.execution.CodegenSupport.consume(WholeStageCodegenExec.scala:195)
	at org.apache.spark.sql.execution.CodegenSupport.consume$(WholeStageCodegenExec.scala:150)
	at org.apache.spark.sql.execution.InputAdapter.consume(WholeStageCodegenExec.scala:497)
	at org.apache.spark.sql.execution.InputRDDCodegen.doProduce(WholeStageCodegenExec.scala:484)
	at org.apache.spark.sql.execution.InputRDDCodegen.doProduce$(WholeStageCodegenExec.scala:457)
	at org.apache.spark.sql.execution.InputAdapter.doProduce(WholeStageCodegenExec.scala:497)
	at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:96)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:222)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:219)
	at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:91)
	at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:91)
	at org.apache.spark.sql.execution.InputAdapter.produce(WholeStageCodegenExec.scala:497)
	at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:54)
	at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:96)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:222)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:219)
	at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:91)
	at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:91)
	at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:41)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doCodeGen(WholeStageCodegenExec.scala:659)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:722)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:184)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:222)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:219)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:325)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:443)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:429)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:48)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3715)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2728)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3706)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3704)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2728)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2935)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:287)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:326)
	at jdk.internal.reflect.GeneratedMethodAccessor54.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.lang.IllegalArgumentException: All week-based patterns are unsupported since Spark 3.0, detected: Y, Please use the SQL function EXTRACT instead
	at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$.$anonfun$convertIncompatiblePattern$4(DateTimeFormatterHelper.scala:319)
	at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$.$anonfun$convertIncompatiblePattern$4$adapted(DateTimeFormatterHelper.scala:317)
	at scala.collection.TraversableLike$WithFilter.$anonfun$foreach$1(TraversableLike.scala:985)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.immutable.StringOps.foreach(StringOps.scala:33)
	at scala.collection.TraversableLike$WithFilter.foreach(TraversableLike.scala:984)
	at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$.$anonfun$convertIncompatiblePattern$2(DateTimeFormatterHelper.scala:317)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
	at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:198)
	at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper$.convertIncompatiblePattern(DateTimeFormatterHelper.scala:314)
	at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper.getOrCreateFormatter(DateTimeFormatterHelper.scala:121)
	at org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper.getOrCreateFormatter$(DateTimeFormatterHelper.scala:117)
	at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.getOrCreateFormatter(TimestampFormatter.scala:92)
	at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.formatter$lzycompute(TimestampFormatter.scala:101)
	at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.formatter(TimestampFormatter.scala:100)
	at org.apache.spark.sql.catalyst.util.Iso8601TimestampFormatter.validatePatternString(TimestampFormatter.scala:152)
	... 93 more


In [30]:
# date operations
enriched_movies_df = movies_with_release_dates_df. \
    withColumn("Today", current_date()). \
    withColumn("Right_Now", current_timestamp()). \
    withColumn("Movie_Age", datediff(col("Today"), col("Actual_Release")) / 365)


enriched_movies_df.show()

AnalysisException: cannot resolve 'Actual_Release' given input columns: [Right_Now, Title, Today, to_date(Release_Date, dd-MMM-YY)];
'Project [Title#19, to_date(Release_Date, dd-MMM-YY)#618, Today#630, Right_Now#634, (datediff(Today#630, 'Actual_Release) / 365) AS Movie_Age#639]
+- Project [Title#19, to_date(Release_Date, dd-MMM-YY)#618, Today#630, current_timestamp() AS Right_Now#634]
   +- Project [Title#19, to_date(Release_Date, dd-MMM-YY)#618, current_date(Some(Etc/UTC)) AS Today#630]
      +- Project [Title#19, to_date('Release_Date, Some(dd-MMM-YY)) AS to_date(Release_Date, dd-MMM-YY)#618]
         +- Relation [Creative_Type#7,Director#8,Distributor#9,IMDB_Rating#10,IMDB_Votes#11L,MPAA_Rating#12,Major_Genre#13,Production_Budget#14L,Release_Date#15,Rotten_Tomatoes_Rating#16L,Running_Time_min#17L,Source#18,Title#19,US_DVD_Sales#20L,US_Gross#21L,Worldwide_Gross#22L] json


In [31]:
# check for empty date
no_release_known_df = movies_with_release_dates_df.filter(col("Actual_Release").isNull())
no_release_known_df.show()

AnalysisException: cannot resolve 'Actual_Release' given input columns: [Title, to_date(Release_Date, dd-MMM-YY)];
'Filter isnull('Actual_Release)
+- Project [Title#19, to_date('Release_Date, Some(dd-MMM-YY)) AS to_date(Release_Date, dd-MMM-YY)#618]
   +- Relation [Creative_Type#7,Director#8,Distributor#9,IMDB_Rating#10,IMDB_Votes#11L,MPAA_Rating#12,Major_Genre#13,Production_Budget#14L,Release_Date#15,Rotten_Tomatoes_Rating#16L,Running_Time_min#17L,Source#18,Title#19,US_DVD_Sales#20L,US_Gross#21L,Worldwide_Gross#22L] json


In [None]:
# hypothetical
movies_with_2_formats = movies_df.select(col("Title"), col("Release_Date")). \
    withColumn("Date_F1", to_date(col("Release_Date"), "dd-MM-yyyy")). \
    withColumn("Date_F2", to_date(col("Release_Date"), "yyyy-MM-dd")). \
    withColumn("Actual_Date", coalesce(col("Date_F1"), col("Date_F2")))

# Structures

In [33]:
# structures
print("structures create")
movies_struct_df = movies_df. \
    select(col("Title"), struct(col("US_Gross"), col("Worldwide_Gross"), col("US_DVD_Sales")).alias("Profit"))

movies_struct_df.show()


structures create
+--------------------+--------------------+
|               Title|              Profit|
+--------------------+--------------------+
|      The Land Girls|{146083, 146083, ...|
|First Love, Last ...|{10876, 10876, null}|
|I Married a Stran...|{203134, 203134, ...|
|Let's Talk About Sex|{373615, 373615, ...|
|                Slam|{1009819, 1087521...|
| Mississippi Mermaid|{24551, 2624551, ...|
|           Following|{44705, 44705, null}|
|             Foolish|{6026908, 6026908...|
|             Pirates|{1641825, 6341825...|
|     Duel in the Sun|{20400000, 204000...|
|           Tom Jones|{37600000, 376000...|
|             Oliver!|{37402877, 374028...|
|To Kill A Mocking...|{13129846, 131298...|
|    Tora, Tora, Tora|{29548291, 295482...|
|   Hollywood Shuffle|{5228617, 5228617...|
|Over the Hill to ...|{3000000, 3000000...|
|              Wilson|{2000000, 2000000...|
|        Darling Lili|{5000000, 5000000...|
|The Ten Commandments|{80000000, 800000...|
|        12 An

In [34]:
# get fields

movies_struct_df. \
    select(col("Title"), col("Profit").getField("US_Gross").alias("US_Profit")).\
    show()


+--------------------+---------+
|               Title|US_Profit|
+--------------------+---------+
|      The Land Girls|   146083|
|First Love, Last ...|    10876|
|I Married a Stran...|   203134|
|Let's Talk About Sex|   373615|
|                Slam|  1009819|
| Mississippi Mermaid|    24551|
|           Following|    44705|
|             Foolish|  6026908|
|             Pirates|  1641825|
|     Duel in the Sun| 20400000|
|           Tom Jones| 37600000|
|             Oliver!| 37402877|
|To Kill A Mocking...| 13129846|
|    Tora, Tora, Tora| 29548291|
|   Hollywood Shuffle|  5228617|
|Over the Hill to ...|  3000000|
|              Wilson|  2000000|
|        Darling Lili|  5000000|
|The Ten Commandments| 80000000|
|        12 Angry Men|        0|
+--------------------+---------+
only showing top 20 rows



In [35]:
# structures - SQL expression strings
movies_struct_df_v2 = movies_df. \
    selectExpr("Title", "(US_Gross, Worldwide_Gross, US_DVD_Sales) as Profit"). \
    selectExpr("Title", "Profit.US_Gross as US_Profit")

movies_struct_df_v2.show()

+--------------------+---------+
|               Title|US_Profit|
+--------------------+---------+
|      The Land Girls|   146083|
|First Love, Last ...|    10876|
|I Married a Stran...|   203134|
|Let's Talk About Sex|   373615|
|                Slam|  1009819|
| Mississippi Mermaid|    24551|
|           Following|    44705|
|             Foolish|  6026908|
|             Pirates|  1641825|
|     Duel in the Sun| 20400000|
|           Tom Jones| 37600000|
|             Oliver!| 37402877|
|To Kill A Mocking...| 13129846|
|    Tora, Tora, Tora| 29548291|
|   Hollywood Shuffle|  5228617|
|Over the Hill to ...|  3000000|
|              Wilson|  2000000|
|        Darling Lili|  5000000|
|The Ten Commandments| 80000000|
|        12 Angry Men|        0|
+--------------------+---------+
only showing top 20 rows



In [36]:
# very nested data structures
movies_struct_df_v3 = movies_df. \
    selectExpr("Title",
               "((IMDB_Rating, Rotten_Tomatoes_Rating) as Rating, (US_Gross, Worldwide_Gross, US_DVD_Sales) as Profit) as Success")
print("nested data structures")

movies_struct_df_v3.show()


nested data structures
+--------------------+--------------------+
|               Title|             Success|
+--------------------+--------------------+
|      The Land Girls|{{6.1, null}, {14...|
|First Love, Last ...|{{6.9, null}, {10...|
|I Married a Stran...|{{6.8, null}, {20...|
|Let's Talk About Sex|{{null, 13}, {373...|
|                Slam|{{3.4, 62}, {1009...|
| Mississippi Mermaid|{{null, null}, {2...|
|           Following|{{7.7, null}, {44...|
|             Foolish|{{3.8, null}, {60...|
|             Pirates|{{5.8, 25}, {1641...|
|     Duel in the Sun|{{7.0, 86}, {2040...|
|           Tom Jones|{{7.0, 81}, {3760...|
|             Oliver!|{{7.5, 84}, {3740...|
|To Kill A Mocking...|{{8.4, 97}, {1312...|
|    Tora, Tora, Tora|{{null, null}, {2...|
|   Hollywood Shuffle|{{6.8, 87}, {5228...|
|Over the Hill to ...|{{null, null}, {3...|
|              Wilson|{{7.0, null}, {20...|
|        Darling Lili|{{6.1, null}, {50...|
|The Ten Commandments|{{2.5, 90}, {8000...|
|        

In [37]:
movies_struct_df_v3. \
    selectExpr("Title", "Success.Rating.IMDB_Rating as IMDB").show()

movies_struct_df_v3.show()

+--------------------+----+
|               Title|IMDB|
+--------------------+----+
|      The Land Girls| 6.1|
|First Love, Last ...| 6.9|
|I Married a Stran...| 6.8|
|Let's Talk About Sex|null|
|                Slam| 3.4|
| Mississippi Mermaid|null|
|           Following| 7.7|
|             Foolish| 3.8|
|             Pirates| 5.8|
|     Duel in the Sun| 7.0|
|           Tom Jones| 7.0|
|             Oliver!| 7.5|
|To Kill A Mocking...| 8.4|
|    Tora, Tora, Tora|null|
|   Hollywood Shuffle| 6.8|
|Over the Hill to ...|null|
|              Wilson| 7.0|
|        Darling Lili| 6.1|
|The Ten Commandments| 2.5|
|        12 Angry Men| 8.9|
+--------------------+----+
only showing top 20 rows

+--------------------+--------------------+
|               Title|             Success|
+--------------------+--------------------+
|      The Land Girls|{{6.1, null}, {14...|
|First Love, Last ...|{{6.9, null}, {10...|
|I Married a Stran...|{{6.8, null}, {20...|
|Let's Talk About Sex|{{null, 13}, {37

# arrays

In [40]:

movies_with_words_df = movies_df.\
    select(col("Title"),
    split(col("Title"), " |,").alias("Title_Words"),
    split(col("Director"), " |,").alias("Director_Words"))

movies_with_words_df.printSchema()
movies_with_words_df.show()

# ^^^^^^^^^^^^^^^^^^^^^^^^ col object of type ARRAY[String]
# you can have nested arrays

root
 |-- Title: string (nullable = true)
 |-- Title_Words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Director_Words: array (nullable = true)
 |    |-- element: string (containsNull = true)

+--------------------+--------------------+--------------------+
|               Title|         Title_Words|      Director_Words|
+--------------------+--------------------+--------------------+
|      The Land Girls|  [The, Land, Girls]|                null|
|First Love, Last ...|[First, Love, , L...|                null|
|I Married a Stran...|[I, Married, a, S...|                null|
|Let's Talk About Sex|[Let's, Talk, Abo...|                null|
|                Slam|              [Slam]|                null|
| Mississippi Mermaid|[Mississippi, Mer...|                null|
|           Following|         [Following]|[Christopher, Nolan]|
|             Foolish|           [Foolish]|                null|
|             Pirates|           [Pirates]|   [Roman, Polan

In [41]:
# array operations
array_ops_df = movies_with_words_df.select(
    col("Title"),
    expr("Title_Words[0]"),  # the first element in the array
    size(col("Title_Words")),  # the length of the array
    array_contains(col("Title_Words"), "Love")
    # a bunch of array_(...) functions
)

array_ops_df.show()


+--------------------+--------------+-----------------+---------------------------------+
|               Title|Title_Words[0]|size(Title_Words)|array_contains(Title_Words, Love)|
+--------------------+--------------+-----------------+---------------------------------+
|      The Land Girls|           The|                3|                            false|
|First Love, Last ...|         First|                5|                             true|
|I Married a Stran...|             I|                5|                            false|
|Let's Talk About Sex|         Let's|                4|                            false|
|                Slam|          Slam|                1|                            false|
| Mississippi Mermaid|   Mississippi|                2|                            false|
|           Following|     Following|                1|                            false|
|             Foolish|       Foolish|                1|                            false|
|         

In [43]:
# Flat arrays

array_ops_df = movies_with_words_df.select(
    col("Title"),
    explode(col("Title_Words"))
)

array_ops_df.show()


+--------------------+-----------+
|               Title|        col|
+--------------------+-----------+
|      The Land Girls|        The|
|      The Land Girls|       Land|
|      The Land Girls|      Girls|
|First Love, Last ...|      First|
|First Love, Last ...|       Love|
|First Love, Last ...|           |
|First Love, Last ...|       Last|
|First Love, Last ...|      Rites|
|I Married a Stran...|          I|
|I Married a Stran...|    Married|
|I Married a Stran...|          a|
|I Married a Stran...|    Strange|
|I Married a Stran...|     Person|
|Let's Talk About Sex|      Let's|
|Let's Talk About Sex|       Talk|
|Let's Talk About Sex|      About|
|Let's Talk About Sex|        Sex|
|                Slam|       Slam|
| Mississippi Mermaid|Mississippi|
| Mississippi Mermaid|    Mermaid|
+--------------------+-----------+
only showing top 20 rows

