In [1]:
from time import sleep

from pyspark.sql import SparkSession, Window
from pyspark.sql.types import *
from pyspark.sql.functions import *

import findspark
findspark.init()

# SparkSession is the entry point for the HIGH-LEVEL API (DataFrames, Spark SQL)
spark = SparkSession. \
    builder. \
    config("spark.sql.autoBroadcastJoinThreshold", 0). \
    appName("Joins"). \
    master("local"). \
    getOrCreate()

In [2]:
movies_df = spark.read. \
    format("json"). \
    option("inferSchema", "true"). \
    load("data/movies")

In [9]:
# 1
# what's wrong with a SinglePartition
# how to add column with row_num() and count()
# read.parquet.count use schema

whole_dataset = Window.partitionBy().orderBy(col("Title").asc_nulls_last())

single_part_df = movies_df.select(col("Title"), row_number().over(whole_dataset))
#single_part_df = movies_df.select(col("Title"))\
#    .withColumn("row_number", monotonically_increasing_id())
single_part_df.explain()
single_part_df.show()


== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Window [row_number() windowspecdefinition(Title#20 ASC NULLS LAST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS row_number() OVER (ORDER BY Title ASC NULLS LAST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#124], [Title#20 ASC NULLS LAST]
   +- Sort [Title#20 ASC NULLS LAST], false, 0
      +- Exchange SinglePartition, ENSURE_REQUIREMENTS, [plan_id=157]
         +- FileScan json [Title#20] Batched: false, DataFilters: [], Format: JSON, Location: InMemoryFileIndex(1 paths)[file:/D:/private/workspace/spark-cluster/src/main/resources/notebooks/..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<Title:string>


+--------------------+--------------------------------------------------------------------------------------------------+
|               Title|row_number() OVER (ORDER BY Title ASC NULLS LAST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)|
+--------------------+---------------

In [10]:
non_single_part_df = movies_df.select(col("Title"), monotonically_increasing_id())
non_single_part_df.explain()
single_part_df.sample(0.1).show()


== Physical Plan ==
*(1) Project [Title#20, monotonically_increasing_id() AS monotonically_increasing_id()#137L]
+- FileScan json [Title#20] Batched: false, DataFilters: [], Format: JSON, Location: InMemoryFileIndex(1 paths)[file:/D:/private/workspace/spark-cluster/src/main/resources/notebooks/..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<Title:string>


+--------------------+--------------------------------------------------------------------------------------------------+
|               Title|row_number() OVER (ORDER BY Title ASC NULLS LAST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)|
+--------------------+--------------------------------------------------------------------------------------------------+
|         10,000 B.C.|                                                                                                 1|
|          16 to Life|                                                                                                11|
|20,000 League

In [11]:
# 2
# How to read all data from cache?
# Partial caching - cashing only parts which were calculated by some action. That is the couse that part of data
# was from cache the other from source.

partition_of_100_df = spark.range(0, 10000, 1, 100)
partition_of_100_df.cache()


DataFrame[id: bigint]

In [12]:
# use only one partition, use only one partition FRACTION CACHE 1% - http://localhost:4040/storage/
# consistence can be uncorrected USE .count to put all data to cache
# deserialized - as Java object, serialized - as Array[Byte]

# partition_of_100_df.show(1)

partition_of_100_df.count()
partition_of_100_df.show(1)


+---+
| id|
+---+
|  0|
+---+
only showing top 1 row



In [13]:
# show data on local disk and disk spil
# InMemoryRelation - load data to cache

partition_of_100_df.explain()
# InMemoryTableScn - load data to cache


== Physical Plan ==
InMemoryTableScan [id#150L]
   +- InMemoryRelation [id#150L], StorageLevel(disk, memory, deserialized, 1 replicas)
         +- *(1) Range (0, 10000, step=1, splits=100)




# 4 Join optimisation

In [14]:
# dataframe of facts

crime_facts = spark \
    .read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv("data/crimes/crime.csv")

check = crime_facts.cache().count()
assert(check != 0)

In [15]:
# Catalyst optimiser - move filter up

grouped_crime_df = crime_facts.\
    groupBy(col("OFFENSE_CODE")).\
    count().\
    filter(col("OFFENSE_CODE") == 1402)

grouped_crime_df.explain(True)
grouped_crime_df.show()

== Parsed Logical Plan ==
'Filter ('OFFENSE_CODE = 1402)
+- Aggregate [OFFENSE_CODE#246], [OFFENSE_CODE#246, count(1) AS count#846L]
   +- Relation [INCIDENT_NUMBER#245,OFFENSE_CODE#246,OFFENSE_CODE_GROUP#247,OFFENSE_DESCRIPTION#248,DISTRICT#249,REPORTING_AREA#250,SHOOTING#251,OCCURRED_ON_DATE#252,YEAR#253,MONTH#254,DAY_OF_WEEK#255,HOUR#256,UCR_PART#257,STREET#258,Lat#259,Long#260,Location#261] csv

== Analyzed Logical Plan ==
OFFENSE_CODE: int, count: bigint
Filter (OFFENSE_CODE#246 = 1402)
+- Aggregate [OFFENSE_CODE#246], [OFFENSE_CODE#246, count(1) AS count#846L]
   +- Relation [INCIDENT_NUMBER#245,OFFENSE_CODE#246,OFFENSE_CODE_GROUP#247,OFFENSE_DESCRIPTION#248,DISTRICT#249,REPORTING_AREA#250,SHOOTING#251,OCCURRED_ON_DATE#252,YEAR#253,MONTH#254,DAY_OF_WEEK#255,HOUR#256,UCR_PART#257,STREET#258,Lat#259,Long#260,Location#261] csv

== Optimized Logical Plan ==
Aggregate [OFFENSE_CODE#246], [OFFENSE_CODE#246, count(1) AS count#846L]
+- Project [OFFENSE_CODE#246]
   +- Filter (isnotnull(O

In [16]:
# small table with dict data
offense_сodes = spark.\
    read.\
    option("header", "true").\
    option("inferSchema", "true").\
    csv("data/crimes/offense_codes.csv")

offense_сodes.count()

assert(offense_сodes.count() == 576)

offense_сodes.show(100, False)


+----+----------------------------------------------------------+
|CODE|NAME                                                      |
+----+----------------------------------------------------------+
|612 |LARCENY PURSE SNATCH - NO FORCE                           |
|613 |LARCENY SHOPLIFTING                                       |
|615 |LARCENY THEFT OF MV PARTS & ACCESSORIES                   |
|1731|INCEST                                                    |
|3111|LICENSE PREMISE VIOLATION                                 |
|2646|LIQUOR - DRINKING IN PUBLIC                               |
|2204|LIQUOR LAW VIOLATION                                      |
|3810|M/V ACCIDENT - INVOLVING �BICYCLE - INJURY                |
|3801|M/V ACCIDENT - OTHER                                      |
|3807|M/V ACCIDENT - OTHER CITY VEHICLE                         |
|3803|M/V ACCIDENT - PERSONAL INJURY                            |
|3805|M/V ACCIDENT - POLICE VEHICLE                             |
|3802|M/V 

In [17]:
# Sort merge join example
rob_sort_merge_df = crime_facts.\
    join(offense_сodes, col("CODE") == col("OFFENSE_CODE")).\
    filter(col("NAME").startswith("ROBBERY")).\
    groupBy(col("NAME")).\
    count().\
    orderBy(col("count").desc())


rob_sort_merge_df.explain(True)
rob_sort_merge_df.show()

== Parsed Logical Plan ==
'Sort ['count DESC NULLS LAST], true
+- Aggregate [NAME#1559], [NAME#1559, count(1) AS count#1646L]
   +- Filter StartsWith(NAME#1559, ROBBERY)
      +- Join Inner, (CODE#1558 = OFFENSE_CODE#246)
         :- Relation [INCIDENT_NUMBER#245,OFFENSE_CODE#246,OFFENSE_CODE_GROUP#247,OFFENSE_DESCRIPTION#248,DISTRICT#249,REPORTING_AREA#250,SHOOTING#251,OCCURRED_ON_DATE#252,YEAR#253,MONTH#254,DAY_OF_WEEK#255,HOUR#256,UCR_PART#257,STREET#258,Lat#259,Long#260,Location#261] csv
         +- Relation [CODE#1558,NAME#1559] csv

== Analyzed Logical Plan ==
NAME: string, count: bigint
Sort [count#1646L DESC NULLS LAST], true
+- Aggregate [NAME#1559], [NAME#1559, count(1) AS count#1646L]
   +- Filter StartsWith(NAME#1559, ROBBERY)
      +- Join Inner, (CODE#1558 = OFFENSE_CODE#246)
         :- Relation [INCIDENT_NUMBER#245,OFFENSE_CODE#246,OFFENSE_CODE_GROUP#247,OFFENSE_DESCRIPTION#248,DISTRICT#249,REPORTING_AREA#250,SHOOTING#251,OCCURRED_ON_DATE#252,YEAR#253,MONTH#254,DAY_OF_W

In [18]:
# Broadcast Join Comparing

rob_broadcast_df = crime_facts.\
    join(broadcast(offense_сodes), col("CODE") == col("OFFENSE_CODE")).\
    filter(col("NAME").startswith("ROBBERY")).\
    groupBy(col("NAME")).\
    count().\
    orderBy(col("count").desc())

rob_broadcast_df.explain(True)
rob_broadcast_df.show()

== Parsed Logical Plan ==
'Sort ['count DESC NULLS LAST], true
+- Aggregate [NAME#1559], [NAME#1559, count(1) AS count#2400L]
   +- Filter StartsWith(NAME#1559, ROBBERY)
      +- Join Inner, (CODE#1558 = OFFENSE_CODE#246)
         :- Relation [INCIDENT_NUMBER#245,OFFENSE_CODE#246,OFFENSE_CODE_GROUP#247,OFFENSE_DESCRIPTION#248,DISTRICT#249,REPORTING_AREA#250,SHOOTING#251,OCCURRED_ON_DATE#252,YEAR#253,MONTH#254,DAY_OF_WEEK#255,HOUR#256,UCR_PART#257,STREET#258,Lat#259,Long#260,Location#261] csv
         +- ResolvedHint (strategy=broadcast)
            +- Relation [CODE#1558,NAME#1559] csv

== Analyzed Logical Plan ==
NAME: string, count: bigint
Sort [count#2400L DESC NULLS LAST], true
+- Aggregate [NAME#1559], [NAME#1559, count(1) AS count#2400L]
   +- Filter StartsWith(NAME#1559, ROBBERY)
      +- Join Inner, (CODE#1558 = OFFENSE_CODE#246)
         :- Relation [INCIDENT_NUMBER#245,OFFENSE_CODE#246,OFFENSE_CODE_GROUP#247,OFFENSE_DESCRIPTION#248,DISTRICT#249,REPORTING_AREA#250,SHOOTING#251

# Shared variables

In [19]:
sc = spark.sparkContext
accum = sc.accumulator(0)

sc.parallelize([1, 2, 3, 4]).foreach(lambda x: accum.add(x))

accum.value

10

In [6]:
broadcastVar = sc.broadcast([1, 2, 3])
broadcastVar.value

[1, 2, 3]