In [2]:
from time import sleep

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import functions as F

import findspark
findspark.init()

spark = SparkSession. \
    builder. \
    appName("Joins"). \
    master("local"). \
    config("spark.jars", "jars/postgresql-42.2.19.jar"). \
    getOrCreate()


In [3]:
movies_df = spark.read.json("data/movies")
assert(movies_df.count() != 0)

# Filters

In [4]:
# demo_literal_values

meaning_of_life_df = movies_df.select(col("Title"), lit(42).alias("MOL"))
meaning_of_life_df.show(5, False)

+--------------------------+---+
|Title                     |MOL|
+--------------------------+---+
|The Land Girls            |42 |
|First Love, Last Rites    |42 |
|I Married a Strange Person|42 |
|Let's Talk About Sex      |42 |
|Slam                      |42 |
+--------------------------+---+
only showing top 5 rows



In [5]:
# demo_booleans

drama_filter = movies_df.Major_Genre == "Drama" # column object of TYPE boolean
good_rating_filter = movies_df.IMDB_Rating > 7.0
# can use & (and), | (or), ~ (not)
good_drama_filter = good_rating_filter & drama_filter

# can use boolean column objects as arguments to filter
good_dramas_df = movies_df.filter(good_drama_filter).select("Title", "Major_Genre", "IMDB_Rating")
good_dramas_df.show()


+--------------------+-----------+-----------+
|               Title|Major_Genre|IMDB_Rating|
+--------------------+-----------+-----------+
|        12 Angry Men|      Drama|        8.9|
|      Twelve Monkeys|      Drama|        8.1|
|    Twin Falls Idaho|      Drama|        7.1|
|                Amen|      Drama|        7.4|
|        Barry Lyndon|      Drama|        8.1|
|      Before Sunrise|      Drama|        8.0|
|The Best Years of...|      Drama|        8.2|
|      The Big Parade|      Drama|        8.4|
|     Boyz n the Hood|      Drama|        7.8|
|De battre mon coe...|      Drama|        7.3|
|The Birth of a Na...|      Drama|        7.1|
|The Bridge on the...|      Drama|        8.4|
|Born on the Fourt...|      Drama|        7.2|
|The Bridges of Ma...|      Drama|        7.2|
|          Braveheart|      Drama|        8.4|
|    Chariots of Fire|      Drama|        7.3|
|Cat on a Hot Tin ...|      Drama|        8.0|
|    The Color Purple|      Drama|        7.7|
|   Central d

In [29]:
# can add the col object as a column/property for every row
movies_with_good_drama_condition_df = movies_df\
    .select(col("Title"), col("Major_Genre"), col("IMDB_Rating"), good_drama_filter.alias("IsItAGoodDrama"))

movies_with_good_drama_condition_df.show(5, False)

+--------------------------+-----------+-----------+--------------+
|Title                     |Major_Genre|IMDB_Rating|IsItAGoodDrama|
+--------------------------+-----------+-----------+--------------+
|The Land Girls            |null       |6.1        |false         |
|First Love, Last Rites    |Drama      |6.9        |false         |
|I Married a Strange Person|Comedy     |6.8        |false         |
|Let's Talk About Sex      |Comedy     |null       |false         |
|Slam                      |Drama      |3.4        |false         |
+--------------------------+-----------+-----------+--------------+
only showing top 5 rows



In [30]:
# can filter using the true/false value of a column
good_dramas_df_v2 = movies_with_good_drama_condition_df.filter("IsItAGoodDrama")

good_dramas_df_v2.show(5, False)

+----------------+-----------+-----------+--------------+
|Title           |Major_Genre|IMDB_Rating|IsItAGoodDrama|
+----------------+-----------+-----------+--------------+
|12 Angry Men    |Drama      |8.9        |true          |
|Twelve Monkeys  |Drama      |8.1        |true          |
|Twin Falls Idaho|Drama      |7.1        |true          |
|Amen            |Drama      |7.4        |true          |
|Barry Lyndon    |Drama      |8.1        |true          |
+----------------+-----------+-----------+--------------+
only showing top 5 rows



In [31]:
# negation
bad_drama_filter = ~good_drama_filter
bad_dramas = movies_df.select(col("Title"), bad_drama_filter)
bad_dramas.show(5, False)


+--------------------------+-----------------------------------------------------+
|Title                     |(NOT ((IMDB_Rating > 7.0) AND (Major_Genre = Drama)))|
+--------------------------+-----------------------------------------------------+
|The Land Girls            |true                                                 |
|First Love, Last Rites    |true                                                 |
|I Married a Strange Person|true                                                 |
|Let's Talk About Sex      |true                                                 |
|Slam                      |true                                                 |
+--------------------------+-----------------------------------------------------+
only showing top 5 rows



# Stat num functions

In [34]:
# CAN USE  +  -  /   *  
movies_avg_ratings_df = movies_df\
    .select(
    col("Title"),
    (col("Rotten_Tomatoes_Rating") / 10 + col("IMDB_Rating")) / 2
)
movies_avg_ratings_df.show()

+--------------------+---------------------------------------------------+
|               Title|(((Rotten_Tomatoes_Rating / 10) + IMDB_Rating) / 2)|
+--------------------+---------------------------------------------------+
|      The Land Girls|                                               null|
|First Love, Last ...|                                               null|
|I Married a Stran...|                                               null|
|Let's Talk About Sex|                                               null|
|                Slam|                                                4.8|
| Mississippi Mermaid|                                               null|
|           Following|                                               null|
|             Foolish|                                               null|
|             Pirates|                                               4.15|
|     Duel in the Sun|                                                7.8|
|           Tom Jones|   

In [35]:
# can use ==, >=, >, <, <= to obtain boolean col objects
movies_df.select(\
    col("Title"),
    (col("Rotten_Tomatoes_Rating") == col("IMDB_Rating"))).show()


+--------------------+--------------------------------------+
|               Title|(Rotten_Tomatoes_Rating = IMDB_Rating)|
+--------------------+--------------------------------------+
|      The Land Girls|                                  null|
|First Love, Last ...|                                  null|
|I Married a Stran...|                                  null|
|Let's Talk About Sex|                                  null|
|                Slam|                                 false|
| Mississippi Mermaid|                                  null|
|           Following|                                  null|
|             Foolish|                                  null|
|             Pirates|                                 false|
|     Duel in the Sun|                                 false|
|           Tom Jones|                                 false|
|             Oliver!|                                 false|
|To Kill A Mocking...|                                 false|
|    Tor

In [36]:
# Pearson correlation - for numerical fields
# a number [-1, 1]
# is an "action" (the DF must be evaluated)
rating_correlation = movies_df.stat.corr("IMDB_Rating", "Rotten_Tomatoes_Rating")
print(rating_correlation)


0.4259708986248316


# String functions

In [38]:
# translate the first letter of each word to upper case in the sentence.
movies_df.select(initcap(col("Title"))).show()

+--------------------+
|      initcap(Title)|
+--------------------+
|      The Land Girls|
|First Love, Last ...|
|I Married A Stran...|
|Let's Talk About Sex|
|                Slam|
| Mississippi Mermaid|
|           Following|
|             Foolish|
|             Pirates|
|     Duel In The Sun|
|           Tom Jones|
|             Oliver!|
|To Kill A Mocking...|
|    Tora, Tora, Tora|
|   Hollywood Shuffle|
|Over The Hill To ...|
|              Wilson|
|        Darling Lili|
|The Ten Commandments|
|        12 Angry Men|
+--------------------+
only showing top 20 rows



In [39]:
movies_df.filter(col("Title").contains("love")).show(5, False)


+------------------+--------------+--------------------+-----------+----------+-----------+-----------+-----------------+------------+----------------------+----------------+-------------------------+-----------+------------+--------+---------------+
|Creative_Type     |Director      |Distributor         |IMDB_Rating|IMDB_Votes|MPAA_Rating|Major_Genre|Production_Budget|Release_Date|Rotten_Tomatoes_Rating|Running_Time_min|Source                   |Title      |US_DVD_Sales|US_Gross|Worldwide_Gross|
+------------------+--------------+--------------------+-----------+----------+-----------+-----------+-----------------+------------+----------------------+----------------+-------------------------+-----------+------------+--------+---------------+
|Science Fiction   |Matt Reeves   |Paramount Pictures  |7.4        |136068    |PG-13      |Action     |25000000         |18-Jan-08   |76                    |null            |Original Screenplay      |Cloverfield|29180398    |80048433|170764033    

# Regexes filtering

In [40]:
cars_df = spark.read.json("data/cars")

regexString = "volkswagen|vw"
vw_df = cars_df.select(
    col("Name"),
    regexp_extract(col("Name"), regexString, 0).alias("regex_extract")
).filter(col("regex_extract") != "")

vw_df.show(5, False)

+----------------------------+-------------+
|Name                        |regex_extract|
+----------------------------+-------------+
|volkswagen 1131 deluxe sedan|volkswagen   |
|volkswagen super beetle 117 |volkswagen   |
|volkswagen model 111        |volkswagen   |
|volkswagen type 3           |volkswagen   |
|volkswagen 411 (sw)         |volkswagen   |
+----------------------------+-------------+
only showing top 5 rows



In [41]:
vw_new_name_df = vw_df.select(
    col("Name"),
    regexp_replace(col("Name"), regexString, "Volkswagen").alias("replacement")
)
vw_new_name_df.show(5, False)

+----------------------------+----------------------------+
|Name                        |replacement                 |
+----------------------------+----------------------------+
|volkswagen 1131 deluxe sedan|Volkswagen 1131 deluxe sedan|
|volkswagen super beetle 117 |Volkswagen super beetle 117 |
|volkswagen model 111        |Volkswagen model 111        |
|volkswagen type 3           |Volkswagen type 3           |
|volkswagen 411 (sw)         |Volkswagen 411 (sw)         |
+----------------------------+----------------------------+
only showing top 5 rows



# Exercise

    Filter the cars DF, return all cars whose name contains either element of the list
    - contains function
    - regexes

In [44]:
def get_car_names():
    return ["Volkswagen", "Mercedes-Benz", "Ford"]

+----+-------------+
|Name|regex_extract|
+----+-------------+
+----+-------------+



+------------+---------+------------+----------+----------------+---------------------+------+-------------+----------+
|Acceleration|Cylinders|Displacement|Horsepower|Miles_per_Gallon|Name                 |Origin|Weight_in_lbs|Year      |
+------------+---------+------------+----------+----------------+---------------------+------+-------------+----------+
|10.5        |8        |302.0       |140       |17.0            |ford torino          |USA   |3449         |1970-01-01|
|10.0        |8        |429.0       |198       |15.0            |ford galaxie 500     |USA   |4341         |1970-01-01|
|11.0        |8        |351.0       |153       |null            |ford torino (sw)     |USA   |4034         |1970-01-01|
|8.0         |8        |302.0       |140       |null            |ford mustang boss 302|USA   |3353         |1970-01-01|
|16.0        |6        |200.0       |85        |21.0            |ford maverick        |USA   |2587         |1970-01-01|
+------------+---------+------------+---

# Date type

In [118]:
# TODO Not working in Spark 2.4
# How to conver data
spark.conf.set("spark.sql.legacy.timeParserPolicy","LEGACY")

movies_with_release_dates_df = movies_df.select(
    col("Title"),
    to_date(col("Release_Date"), "dd-MMM-YY").alias("Actual_Release")
)

#     to_date(col("Release_Date"), "dd-MMM-YY").alias("Actual_Release")

movies_with_release_dates_df.show()

+--------------------+--------------+
|               Title|Actual_Release|
+--------------------+--------------+
|      The Land Girls|    1997-12-28|
|First Love, Last ...|    1997-12-28|
|I Married a Stran...|    1997-12-28|
|Let's Talk About Sex|    1997-12-28|
|                Slam|    1997-12-28|
| Mississippi Mermaid|    1998-12-27|
|           Following|    1998-12-27|
|             Foolish|    1998-12-27|
|             Pirates|    1985-12-29|
|     Duel in the Sun|    1945-12-30|
|           Tom Jones|    1962-12-30|
|             Oliver!|    1967-12-31|
|To Kill A Mocking...|    1961-12-31|
|    Tora, Tora, Tora|    1969-12-28|
|   Hollywood Shuffle|    1986-12-28|
|Over the Hill to ...|    2019-12-29|
|              Wilson|    1943-12-26|
|        Darling Lili|    1969-12-28|
|The Ten Commandments|    1956-01-01|
|        12 Angry Men|    1956-12-30|
+--------------------+--------------+
only showing top 20 rows



In [119]:
# date operations
enriched_movies_df = movies_with_release_dates_df. \
    withColumn("Today", current_date()). \
    withColumn("Right_Now", current_timestamp()). \
    withColumn("Movie_Age", datediff(col("Today"), col("Actual_Release")) / 365)


enriched_movies_df.show()

+--------------------+--------------+----------+--------------------+------------------+
|               Title|Actual_Release|     Today|           Right_Now|         Movie_Age|
+--------------------+--------------+----------+--------------------+------------------+
|      The Land Girls|    1997-12-28|2023-11-10|2023-11-10 15:32:...|25.884931506849316|
|First Love, Last ...|    1997-12-28|2023-11-10|2023-11-10 15:32:...|25.884931506849316|
|I Married a Stran...|    1997-12-28|2023-11-10|2023-11-10 15:32:...|25.884931506849316|
|Let's Talk About Sex|    1997-12-28|2023-11-10|2023-11-10 15:32:...|25.884931506849316|
|                Slam|    1997-12-28|2023-11-10|2023-11-10 15:32:...|25.884931506849316|
| Mississippi Mermaid|    1998-12-27|2023-11-10|2023-11-10 15:32:...|24.887671232876713|
|           Following|    1998-12-27|2023-11-10|2023-11-10 15:32:...|24.887671232876713|
|             Foolish|    1998-12-27|2023-11-10|2023-11-10 15:32:...|24.887671232876713|
|             Pirates

In [120]:
# check for empty date
no_release_known_df = movies_with_release_dates_df.filter(col("Actual_Release").isNull())
no_release_known_df.show()

+--------------------+--------------+
|               Title|Actual_Release|
+--------------------+--------------+
|   55 Days at Peking|          null|
|Alexander's Ragti...|          null|
|American Ninja 2:...|          null|
|       The Apartment|          null|
|        Barry Lyndon|          null|
|     Barbarians, The|          null|
|Battle for the Pl...|          null|
|Return to the Blu...|          null|
|       The Blue Bird|          null|
| The Broadway Melody|          null|
|           Boom Town|          null|
|      Bathing Beauty|          null|
|The Boys from Brazil|          null|
|         Coming Home|          null|
|Conquest of the P...|          null|
|            Caravans|          null|
|          Casablanca|          null|
|Can't Stop the Music|          null|
|      Donovan's Reef|          null|
|             Dolphin|          null|
+--------------------+--------------+
only showing top 20 rows



In [None]:
# hypothetical
movies_with_2_formats = movies_df.select(col("Title"), col("Release_Date")). \
    withColumn("Date_F1", to_date(col("Release_Date"), "dd-MM-yyyy")). \
    withColumn("Date_F2", to_date(col("Release_Date"), "yyyy-MM-dd")). \
    withColumn("Actual_Date", coalesce(col("Date_F1"), col("Date_F2")))

# Structures

In [45]:
# structures
print("structures create")
movies_struct_df = movies_df. \
    select(col("Title"), struct(col("US_Gross"), col("Worldwide_Gross"), col("US_DVD_Sales")).alias("Profit"))

movies_struct_df.show()


structures create
+--------------------+--------------------+
|               Title|              Profit|
+--------------------+--------------------+
|      The Land Girls|{146083, 146083, ...|
|First Love, Last ...|{10876, 10876, null}|
|I Married a Stran...|{203134, 203134, ...|
|Let's Talk About Sex|{373615, 373615, ...|
|                Slam|{1009819, 1087521...|
| Mississippi Mermaid|{24551, 2624551, ...|
|           Following|{44705, 44705, null}|
|             Foolish|{6026908, 6026908...|
|             Pirates|{1641825, 6341825...|
|     Duel in the Sun|{20400000, 204000...|
|           Tom Jones|{37600000, 376000...|
|             Oliver!|{37402877, 374028...|
|To Kill A Mocking...|{13129846, 131298...|
|    Tora, Tora, Tora|{29548291, 295482...|
|   Hollywood Shuffle|{5228617, 5228617...|
|Over the Hill to ...|{3000000, 3000000...|
|              Wilson|{2000000, 2000000...|
|        Darling Lili|{5000000, 5000000...|
|The Ten Commandments|{80000000, 800000...|
|        12 An

In [46]:
# get fields

movies_struct_df. \
    select(col("Title"), col("Profit").getField("US_Gross").alias("US_Profit")).\
    show()


+--------------------+---------+
|               Title|US_Profit|
+--------------------+---------+
|      The Land Girls|   146083|
|First Love, Last ...|    10876|
|I Married a Stran...|   203134|
|Let's Talk About Sex|   373615|
|                Slam|  1009819|
| Mississippi Mermaid|    24551|
|           Following|    44705|
|             Foolish|  6026908|
|             Pirates|  1641825|
|     Duel in the Sun| 20400000|
|           Tom Jones| 37600000|
|             Oliver!| 37402877|
|To Kill A Mocking...| 13129846|
|    Tora, Tora, Tora| 29548291|
|   Hollywood Shuffle|  5228617|
|Over the Hill to ...|  3000000|
|              Wilson|  2000000|
|        Darling Lili|  5000000|
|The Ten Commandments| 80000000|
|        12 Angry Men|        0|
+--------------------+---------+
only showing top 20 rows



In [35]:
# structures - SQL expression strings
movies_struct_df_v2 = movies_df. \
    selectExpr("Title", "(US_Gross, Worldwide_Gross, US_DVD_Sales) as Profit"). \
    selectExpr("Title", "Profit.US_Gross as US_Profit")

movies_struct_df_v2.show()

+--------------------+---------+
|               Title|US_Profit|
+--------------------+---------+
|      The Land Girls|   146083|
|First Love, Last ...|    10876|
|I Married a Stran...|   203134|
|Let's Talk About Sex|   373615|
|                Slam|  1009819|
| Mississippi Mermaid|    24551|
|           Following|    44705|
|             Foolish|  6026908|
|             Pirates|  1641825|
|     Duel in the Sun| 20400000|
|           Tom Jones| 37600000|
|             Oliver!| 37402877|
|To Kill A Mocking...| 13129846|
|    Tora, Tora, Tora| 29548291|
|   Hollywood Shuffle|  5228617|
|Over the Hill to ...|  3000000|
|              Wilson|  2000000|
|        Darling Lili|  5000000|
|The Ten Commandments| 80000000|
|        12 Angry Men|        0|
+--------------------+---------+
only showing top 20 rows



In [47]:
# very nested data structures
movies_struct_df_v3 = movies_df. \
    selectExpr("Title",
               "((IMDB_Rating, Rotten_Tomatoes_Rating) as Rating, (US_Gross, Worldwide_Gross, US_DVD_Sales) as Profit) as Success")
print("nested data structures")

movies_struct_df_v3.show()


nested data structures
+--------------------+--------------------+
|               Title|             Success|
+--------------------+--------------------+
|      The Land Girls|{{6.1, null}, {14...|
|First Love, Last ...|{{6.9, null}, {10...|
|I Married a Stran...|{{6.8, null}, {20...|
|Let's Talk About Sex|{{null, 13}, {373...|
|                Slam|{{3.4, 62}, {1009...|
| Mississippi Mermaid|{{null, null}, {2...|
|           Following|{{7.7, null}, {44...|
|             Foolish|{{3.8, null}, {60...|
|             Pirates|{{5.8, 25}, {1641...|
|     Duel in the Sun|{{7.0, 86}, {2040...|
|           Tom Jones|{{7.0, 81}, {3760...|
|             Oliver!|{{7.5, 84}, {3740...|
|To Kill A Mocking...|{{8.4, 97}, {1312...|
|    Tora, Tora, Tora|{{null, null}, {2...|
|   Hollywood Shuffle|{{6.8, 87}, {5228...|
|Over the Hill to ...|{{null, null}, {3...|
|              Wilson|{{7.0, null}, {20...|
|        Darling Lili|{{6.1, null}, {50...|
|The Ten Commandments|{{2.5, 90}, {8000...|
|        

In [48]:
movies_struct_df_v3. \
    selectExpr("Title", "Success.Rating.IMDB_Rating as IMDB").show()

movies_struct_df_v3.show()

+--------------------+----+
|               Title|IMDB|
+--------------------+----+
|      The Land Girls| 6.1|
|First Love, Last ...| 6.9|
|I Married a Stran...| 6.8|
|Let's Talk About Sex|null|
|                Slam| 3.4|
| Mississippi Mermaid|null|
|           Following| 7.7|
|             Foolish| 3.8|
|             Pirates| 5.8|
|     Duel in the Sun| 7.0|
|           Tom Jones| 7.0|
|             Oliver!| 7.5|
|To Kill A Mocking...| 8.4|
|    Tora, Tora, Tora|null|
|   Hollywood Shuffle| 6.8|
|Over the Hill to ...|null|
|              Wilson| 7.0|
|        Darling Lili| 6.1|
|The Ten Commandments| 2.5|
|        12 Angry Men| 8.9|
+--------------------+----+
only showing top 20 rows

+--------------------+--------------------+
|               Title|             Success|
+--------------------+--------------------+
|      The Land Girls|{{6.1, null}, {14...|
|First Love, Last ...|{{6.9, null}, {10...|
|I Married a Stran...|{{6.8, null}, {20...|
|Let's Talk About Sex|{{null, 13}, {37

# arrays

In [49]:

movies_with_words_df = movies_df.\
    select(col("Title"),
    split(col("Title"), " |,").alias("Title_Words"),
    split(col("Director"), " |,").alias("Director_Words"))

movies_with_words_df.printSchema()
movies_with_words_df.show()

# ^^^^^^^^^^^^^^^^^^^^^^^^ col object of type ARRAY[String]
# you can have nested arrays

root
 |-- Title: string (nullable = true)
 |-- Title_Words: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- Director_Words: array (nullable = true)
 |    |-- element: string (containsNull = false)

+--------------------+--------------------+--------------------+
|               Title|         Title_Words|      Director_Words|
+--------------------+--------------------+--------------------+
|      The Land Girls|  [The, Land, Girls]|                null|
|First Love, Last ...|[First, Love, , L...|                null|
|I Married a Stran...|[I, Married, a, S...|                null|
|Let's Talk About Sex|[Let's, Talk, Abo...|                null|
|                Slam|              [Slam]|                null|
| Mississippi Mermaid|[Mississippi, Mer...|                null|
|           Following|         [Following]|[Christopher, Nolan]|
|             Foolish|           [Foolish]|                null|
|             Pirates|           [Pirates]|   [Roman, Pol

In [50]:
# array operations
array_ops_df = movies_with_words_df.select(
    col("Title"),
    expr("Title_Words[0]"),  # the first element in the array
    size(col("Title_Words")),  # the length of the array
    array_contains(col("Title_Words"), "Love")
    # a bunch of array_(...) functions
)

array_ops_df.show()


+--------------------+--------------+-----------------+---------------------------------+
|               Title|Title_Words[0]|size(Title_Words)|array_contains(Title_Words, Love)|
+--------------------+--------------+-----------------+---------------------------------+
|      The Land Girls|           The|                3|                            false|
|First Love, Last ...|         First|                5|                             true|
|I Married a Stran...|             I|                5|                            false|
|Let's Talk About Sex|         Let's|                4|                            false|
|                Slam|          Slam|                1|                            false|
| Mississippi Mermaid|   Mississippi|                2|                            false|
|           Following|     Following|                1|                            false|
|             Foolish|       Foolish|                1|                            false|
|         

In [51]:
# Flat arrays

array_ops_df = movies_with_words_df.select(
    col("Title"),
    explode(col("Title_Words"))
)

array_ops_df.show()


+--------------------+-----------+
|               Title|        col|
+--------------------+-----------+
|      The Land Girls|        The|
|      The Land Girls|       Land|
|      The Land Girls|      Girls|
|First Love, Last ...|      First|
|First Love, Last ...|       Love|
|First Love, Last ...|           |
|First Love, Last ...|       Last|
|First Love, Last ...|      Rites|
|I Married a Stran...|          I|
|I Married a Stran...|    Married|
|I Married a Stran...|          a|
|I Married a Stran...|    Strange|
|I Married a Stran...|     Person|
|Let's Talk About Sex|      Let's|
|Let's Talk About Sex|       Talk|
|Let's Talk About Sex|      About|
|Let's Talk About Sex|        Sex|
|                Slam|       Slam|
| Mississippi Mermaid|Mississippi|
| Mississippi Mermaid|    Mermaid|
+--------------------+-----------+
only showing top 20 rows



In [112]:
business_df = spark.read.json("data/business")
assert(business_df.count() != 0)
#business_df.printSchema()

+-----------+--------------------+-------+---------+----------+
|business_id|        full_address|    day|open_time|close_time|
+-----------+--------------------+-------+---------+----------+
|        abs|      random_address| Monday|    10:00|     20:00|
|        abs|      random_address|Tuesday|    10:30|     19:30|
|       temp|another random_ad...| Friday|    10:00|     20:00|
+-----------+--------------------+-------+---------+----------+

