<a href="https://colab.research.google.com/github/jugalpanchal/bd-chef/blob/main/spark_etl_soccer_anlys_recipe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Follow the steps to install the dependencies:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null # install java
!wget -q https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz # spark package download
!tar xf spark-3.1.2-bin-hadoop3.2.tgz # unzip spark package
!pip install -q findspark # install spark

# Set the location of Java and Spark:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"

import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise start a new one.
# "local[*]" runs Spark on this machine using all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Spark_App1")
    .getOrCreate()
)

# Handle to the lower-level SparkContext that backs the session.
sc = spark.sparkContext

In [3]:
# Load the players CSV; the first line of the file supplies the column names.
players = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("datasets/player.csv")
)

# Preview the first 5 rows.
players.show(5)

+---+-------------+------------------+------------------+-------------------+------+------+
| id|player_api_id|       player_name|player_fifa_api_id|           birthday|height|weight|
+---+-------------+------------------+------------------+-------------------+------+------+
|  1|       505942|Aaron Appindangoye|            218353|1992-02-29 00:00:00|182.88|   187|
|  2|       155782|   Aaron Cresswell|            189615|1989-12-15 00:00:00|170.18|   146|
|  3|       162549|       Aaron Doran|            186170|1991-05-13 00:00:00|170.18|   163|
|  4|        30572|     Aaron Galindo|            140161|1982-05-08 00:00:00|182.88|   198|
|  5|        23780|      Aaron Hughes|             17725|1979-11-08 00:00:00|182.88|   154|
+---+-------------+------------------+------------------+-------------------+------+------+
only showing top 5 rows



In [4]:
# Load the player attributes CSV; the first line of the file supplies the column names.
player_attributes = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("datasets/player_attributes.csv")
)

# Preview the first 5 rows.
player_attributes.show(5)

+---+------------------+-------------+-------------------+--------------+---------+--------------+-------------------+-------------------+--------+---------+----------------+-------------+-------+---------+-----+------------------+------------+------------+------------+------------+-------+---------+-------+----------+-------+-------+--------+----------+----------+-------------+-----------+------+---------+-------+---------------+--------------+---------+-----------+----------+--------------+-----------+
| id|player_fifa_api_id|player_api_id|               date|overall_rating|potential|preferred_foot|attacking_work_rate|defensive_work_rate|crossing|finishing|heading_accuracy|short_passing|volleys|dribbling|curve|free_kick_accuracy|long_passing|ball_control|acceleration|sprint_speed|agility|reactions|balance|shot_power|jumping|stamina|strength|long_shots|aggression|interceptions|positioning|vision|penalties|marking|standing_tackle|sliding_tackle|gk_diving|gk_handling|gk_kicking|gk_posit

In [5]:
# Number of rows in each DataFrame, displayed as (players, player_attributes).
(
    players.count(),
    player_attributes.count(),
)

(11060, 183978)

### Join

In [6]:
# A join may have to move rows between worker nodes so that rows with the
# same key from both data sets end up on the same node.

# Pattern: left.join(right, on=<join condition>)
# Joining on a column *expression* keeps the key column of both inputs,
# so player_api_id appears twice in the result.
player_details = players.join(
    player_attributes,
    on=players["player_api_id"] == player_attributes["player_api_id"],
)
player_details.show()

+---+-------------+------------------+------------------+-------------------+------+------+---+------------------+-------------+-------------------+--------------+---------+--------------+-------------------+-------------------+--------+---------+----------------+-------------+-------+---------+-----+------------------+------------+------------+------------+------------+-------+---------+-------+----------+-------+-------+--------+----------+----------+-------------+-----------+------+---------+-------+---------------+--------------+---------+-----------+----------+--------------+-----------+
| id|player_api_id|       player_name|player_fifa_api_id|           birthday|height|weight| id|player_fifa_api_id|player_api_id|               date|overall_rating|potential|preferred_foot|attacking_work_rate|defensive_work_rate|crossing|finishing|heading_accuracy|short_passing|volleys|dribbling|curve|free_kick_accuracy|long_passing|ball_control|acceleration|sprint_speed|agility|reactions|balance|s

In [7]:
# Passing the join key as a list of column NAMES (instead of a column
# expression) keeps a single copy of player_api_id in the result.
player_details = players.join(
    player_attributes,
    on=['player_api_id'],
)

player_details.show()

+-------------+---+------------------+------------------+-------------------+------+------+---+------------------+-------------------+--------------+---------+--------------+-------------------+-------------------+--------+---------+----------------+-------------+-------+---------+-----+------------------+------------+------------+------------+------------+-------+---------+-------+----------+-------+-------+--------+----------+----------+-------------+-----------+------+---------+-------+---------------+--------------+---------+-----------+----------+--------------+-----------+
|player_api_id| id|       player_name|player_fifa_api_id|           birthday|height|weight| id|player_fifa_api_id|               date|overall_rating|potential|preferred_foot|attacking_work_rate|defensive_work_rate|crossing|finishing|heading_accuracy|short_passing|volleys|dribbling|curve|free_kick_accuracy|long_passing|ball_control|acceleration|sprint_speed|agility|reactions|balance|shot_power|jumping|stamina|st

In [8]:
# Two tiny DataFrames for demonstrating the join types.
# Only 'James' and 'Emily' appear in both.

valuesA = [
    ('John', 100000),
    ('James', 150000),
    ('Emily', 65000),
    ('Nina', 200000),
]
tableA = spark.createDataFrame(valuesA, schema=['name', 'salary'])

valuesB = [
    ('James', 2),
    ('Emily', 3),
    ('Darth Vader', 5),
    ('Princess Leia', 6),
]
tableB = spark.createDataFrame(valuesB, schema=['name', 'employee_id'])

tableA.show()
tableB.show()

+-----+------+
| name|salary|
+-----+------+
| John|100000|
|James|150000|
|Emily| 65000|
| Nina|200000|
+-----+------+

+-------------+-----------+
|         name|employee_id|
+-------------+-----------+
|        James|          2|
|        Emily|          3|
|  Darth Vader|          5|
|Princess Leia|          6|
+-------------+-----------+



In [9]:
# Inner join (the default): only names present in both tables.
inner_join = tableA.join(tableB, on=tableA["name"] == tableB["name"])
inner_join.show()

# Left join: every row of tableA, with nulls where tableB has no match.
left_join = tableA.join(tableB, on=tableA["name"] == tableB["name"], how='left')
left_join.show()

# Right join: every row of tableB, with nulls where tableA has no match.
right_join = tableA.join(tableB, on=tableA["name"] == tableB["name"], how='right')
right_join.show()

# Full outer join: every row of both tables, with nulls on whichever side is missing.
full_outer_join = tableA.join(tableB, on=tableA["name"] == tableB["name"], how='full')
full_outer_join.show()

+-----+------+-----+-----------+
| name|salary| name|employee_id|
+-----+------+-----+-----------+
|James|150000|James|          2|
|Emily| 65000|Emily|          3|
+-----+------+-----+-----------+

+-----+------+-----+-----------+
| name|salary| name|employee_id|
+-----+------+-----+-----------+
|James|150000|James|          2|
| John|100000| null|       null|
|Emily| 65000|Emily|          3|
| Nina|200000| null|       null|
+-----+------+-----+-----------+

+-----+------+-------------+-----------+
| name|salary|         name|employee_id|
+-----+------+-------------+-----------+
|James|150000|        James|          2|
| null|  null|Princess Leia|          6|
|Emily| 65000|        Emily|          3|
| null|  null|  Darth Vader|          5|
+-----+------+-------------+-----------+

+-----+------+-------------+-----------+
| name|salary|         name|employee_id|
+-----+------+-------------+-----------+
|James|150000|        James|          2|
| John|100000|         null|       null|
| 

### Broadcast Variables

In [10]:
# A regular join may move rows of both data sets between worker nodes so that
# matching keys meet on the same node, which is costly.
# When one side is small, it is cheaper to send a full copy of the small side
# to every worker and leave the large side where it is.

# players is the smaller data set, so mark it for broadcasting;
# broadcast() is a hint to the join, the data itself is sent when the join runs.
from pyspark.sql.functions import broadcast

player_details = player_attributes.join(
    broadcast(players),
    on=['player_api_id'],
    how='inner',
)
player_details.show()

+-------------+---+------------------+-------------------+--------------+---------+--------------+-------------------+-------------------+--------+---------+----------------+-------------+-------+---------+-----+------------------+------------+------------+------------+------------+-------+---------+-------+----------+-------+-------+--------+----------+----------+-------------+-----------+------+---------+-------+---------------+--------------+---------+-----------+----------+--------------+-----------+---+------------------+------------------+-------------------+------+------+
|player_api_id| id|player_fifa_api_id|               date|overall_rating|potential|preferred_foot|attacking_work_rate|defensive_work_rate|crossing|finishing|heading_accuracy|short_passing|volleys|dribbling|curve|free_kick_accuracy|long_passing|ball_control|acceleration|sprint_speed|agility|reactions|balance|shot_power|jumping|stamina|strength|long_shots|aggression|interceptions|positioning|vision|penalties|mark

### Accumulator


In [11]:
# Shared counters that tasks can add to; both start at 0.
new_accumulator = spark.sparkContext.accumulator
short_height = new_accumulator(0)
tall_height = new_accumulator(0)

def count_players_by_height(row, threshold=175):
  """Classify one row as short or tall and bump the matching accumulator.

  Adds 1 to the module-level ``short_height`` accumulator when the row's
  height is less than or equal to ``threshold``, otherwise adds 1 to
  ``tall_height``.

  Args:
    row: object exposing a ``height`` attribute holding a number or a
      numeric string (anything ``float()`` accepts).
    threshold: largest height that still counts as short. Defaults to 175,
      the original hard-coded cut-off (presumably centimetres - confirm
      against the data set).

  Returns:
    None. The result is recorded only through the accumulators.

  Rows whose height is None are skipped and counted in neither bucket;
  calling ``float(None)`` would otherwise raise and abort the whole job.
  """
  if row.height is None:
    return

  height = float(row.height)

  if height <= threshold:
    short_height.add(1)
  else:
    tall_height.add(1)

In [12]:
# foreach runs the function on the worker nodes, one row at a time; the
# updates from all tasks are merged into the same two accumulators.

# Reset the counters on the driver first: accumulators keep their value
# between runs, so re-running this cell would otherwise add a second round
# of counts on top of the previous one.
short_height.value = 0
tall_height.value = 0

# count_players_by_height already takes a single row, so it can be passed
# directly without a wrapping lambda.
player_details.foreach(count_players_by_height)

# Displayed as (short, tall).
short_height.value, tall_height.value

(19204, 164774)

### UDF

In [13]:
# A UDF wraps ordinary Python code so Spark can run it on the worker nodes,
# once per row.
from pyspark.sql.functions import udf

# Takes the text before the first '-' of the date string (the year).
# Spark passes null column values to a Python UDF as None, and None has no
# .split(); return None for those rows instead of failing the whole job.
year_extract_udf = udf(lambda date: date.split('-')[0] if date is not None else None)

# Adds (or replaces) the 'year' column, so re-running this cell is safe.
player_attributes = player_attributes.withColumn('year', year_extract_udf(player_attributes.date))

player_attributes.show(5)

+---+------------------+-------------+-------------------+--------------+---------+--------------+-------------------+-------------------+--------+---------+----------------+-------------+-------+---------+-----+------------------+------------+------------+------------+------------+-------+---------+-------+----------+-------+-------+--------+----------+----------+-------------+-----------+------+---------+-------+---------------+--------------+---------+-----------+----------+--------------+-----------+----+
| id|player_fifa_api_id|player_api_id|               date|overall_rating|potential|preferred_foot|attacking_work_rate|defensive_work_rate|crossing|finishing|heading_accuracy|short_passing|volleys|dribbling|curve|free_kick_accuracy|long_passing|ball_control|acceleration|sprint_speed|agility|reactions|balance|shot_power|jumping|stamina|strength|long_shots|aggression|interceptions|positioning|vision|penalties|marking|standing_tackle|sliding_tackle|gk_diving|gk_handling|gk_kicking|gk_