* UDF
* Join
* Windowing
* How a Spark script is submitted

In [None]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one, otherwise create it for this notebook.
spark = (
    SparkSession.builder
    .appName('soccer_player')
    .getOrCreate()
)

In [None]:
# CSV inputs, given as paths relative to the notebook's working directory.
player_file = '../datasets/player.csv'
player_attr_file = '../datasets/player_attributes.csv'



In [83]:
# Load the player table, taking column names from the CSV header row.
# No schema is supplied or inferred, so every column is read as a string.
player_df = spark.read.csv(player_file, header=True)

player_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- player_api_id: string (nullable = true)
 |-- player_name: string (nullable = true)
 |-- player_fifa_api_id: string (nullable = true)
 |-- birthday: string (nullable = true)
 |-- height: string (nullable = true)
 |-- weight: string (nullable = true)



In [None]:
# Load the player-attributes table, taking column names from the CSV header row.
pa_df = spark.read.option('header', True).csv(player_attr_file)
pa_df.printSchema()

In [None]:
# Row counts of the player table and the attributes table.
(player_df.count(), pa_df.count())

In [None]:
# Preview the first five player rows.
player_df.show(n=5)

## UDF

In [None]:
from pyspark.sql.functions import udf

In [None]:
# Inspect the raw `date` strings before extracting the year from them.
pa_df.select('date').show(n=5)

In [26]:
# UDF that returns the text before the first '-' of a date string (the year).
# A null date is passed through as null instead of raising inside the UDF.
udf_year_extract = udf(lambda c: c.split('-')[0] if c is not None else None)

# Replace `date` with the derived `year` column; later cells filter on `pa_df.year`.
# The guard keeps the cell re-runnable: once `date` has been dropped the
# transformation is skipped instead of failing on the missing column.
if 'date' in pa_df.columns:
    pa_df = pa_df.withColumn('year', udf_year_extract(pa_df.date)).drop('date')
sorted(pa_df.columns)

['acceleration',
 'aggression',
 'agility',
 'attacking_work_rate',
 'balance',
 'ball_control',
 'crossing',
 'curve',
 'defensive_work_rate',
 'dribbling',
 'finishing',
 'free_kick_accuracy',
 'gk_diving',
 'gk_handling',
 'gk_kicking',
 'gk_positioning',
 'gk_reflexes',
 'heading_accuracy',
 'id',
 'interceptions',
 'jumping',
 'long_passing',
 'long_shots',
 'marking',
 'overall_rating',
 'penalties',
 'player_api_id',
 'player_fifa_api_id',
 'positioning',
 'potential',
 'preferred_foot',
 'reactions',
 'short_passing',
 'shot_power',
 'sliding_tackle',
 'sprint_speed',
 'stamina',
 'standing_tackle',
 'strength',
 'vision',
 'volleys',
 'year']

# Find the best striker of 2016

In [27]:
# Keep only the attribute rows whose `year` is 2016.
pa_2016 = pa_df.filter(pa_df['year'] == 2016)

In [28]:
# Per-player average of the three attributes used to rate strikers.
pa_strikers = (
    pa_2016
    .groupBy('player_api_id')
    .agg({'shot_power': 'avg', 'finishing': 'avg', 'acceleration': 'avg'})
)


In [29]:
# Preview the per-player averages.
pa_strikers.show(n=5)

+-------------+-----------------+-----------------+---------------+
|player_api_id|   avg(finishing)|avg(acceleration)|avg(shot_power)|
+-------------+-----------------+-----------------+---------------+
|       309726|75.44444444444444|74.11111111111111|           76.0|
|        26112|             53.0|             51.0|           76.0|
|        38433|            68.25|             74.0|           74.0|
|       295060|             25.0|             62.0|           40.0|
|       161396|             29.0|             72.0|           69.0|
+-------------+-----------------+-----------------+---------------+
only showing top 5 rows



### 1. Join with player information in player_df

In [None]:
# Join player names to the striker averages using an explicit equality
# expression between the key columns of the two DataFrames.
joined_striker = (
    player_df
    .select('player_api_id', 'player_name')
    .join(pa_strikers, on=player_df.player_api_id == pa_strikers.player_api_id)
)
joined_striker.show(n=5)

Another way: joining on the column name instead of an equality expression keeps a single `player_api_id` column in the result.

In [74]:
# Join on the shared key by name, so the result has one `player_api_id` column.
joined_striker = (
    player_df
    .select('player_api_id', 'player_name')
    .join(pa_strikers, on='player_api_id')
)
joined_striker.show(n=5)

+-------------+--------------+-----------------+-----------------+---------------+
|player_api_id|   player_name|   avg(finishing)|avg(acceleration)|avg(shot_power)|
+-------------+--------------+-----------------+-----------------+---------------+
|       309726|Andrea Belotti|75.44444444444444|74.11111111111111|           76.0|
|        26112|Benoit Cheyrou|             53.0|             51.0|           76.0|
|        38433|  Borja Valero|            68.25|             74.0|           74.0|
|       295060|  Daniel Potts|             25.0|             62.0|           40.0|
|       161396|Diego Contento|             29.0|             72.0|           69.0|
+-------------+--------------+-----------------+-----------------+---------------+
only showing top 5 rows



### 2. Broadcast join
Broadcast the smaller DataFrame so that every executor receives its own full copy of it.

In [85]:
# Weighted striker score: finishing and acceleration count once, shot power twice.
# The return type is declared as double because `udf` defaults to StringType, which
# would make `total_score` a string column and any later sort on it lexicographic.
# A null input yields a null score instead of raising inside the UDF.
udf_overall_score = udf(
    lambda f, a, s: None if None in (f, a, s) else (f + a + 2 * s) / 4,
    'double',
)
pa_strikers_2016 = (
    pa_strikers
    .withColumn(
        'total_score',
        udf_overall_score('avg(finishing)', 'avg(acceleration)', 'avg(shot_power)'),
    )
    .drop('avg(finishing)', 'avg(acceleration)', 'avg(shot_power)')
)

pa_strikers_2016.show(5)

+-------------+-----------------+
|player_api_id|      total_score|
+-------------+-----------------+
|       309726|75.38888888888889|
|        26112|             64.0|
|        38433|          72.5625|
|       295060|            41.75|
|       161396|            59.75|
+-------------+-----------------+
only showing top 5 rows



In [86]:
# Compare the sizes of the two join inputs; the smaller one is the broadcast candidate.
(pa_strikers_2016.count(), player_df.count())

(5586, 11060)

In [88]:
from pyspark.sql.functions import broadcast

# Inner join of player names with the 2016 striker scores, marking the
# score table for broadcast.
joined_striker = (
    player_df
    .select('player_api_id', 'player_name')
    .join(broadcast(pa_strikers_2016), on=['player_api_id'], how='inner')
)
joined_striker.show(n=5)

+-------------+------------------+-----------------+
|player_api_id|       player_name|      total_score|
+-------------+------------------+-----------------+
|       505942|Aaron Appindangoye|             53.5|
|       155782|   Aaron Cresswell|68.41666666666666|
|       162549|       Aaron Doran|            64.25|
|        30572|     Aaron Galindo|             43.0|
|        27316|        Aaron Hunt|            74.75|
+-------------+------------------+-----------------+
only showing top 5 rows



In [90]:
# Rank strikers from highest to lowest score. The explicit cast keeps the ordering
# numeric even when `total_score` is a string column (a UDF without a declared
# return type produces strings, which would otherwise sort lexicographically).
joined_striker.sort(joined_striker.total_score.cast('double').desc()).show()

+-------------+--------------------+-----------------+
|player_api_id|         player_name|      total_score|
+-------------+--------------------+-----------------+
|        20276|                Hulk|            89.25|
|        37412|       Sergio Aguero|             89.0|
|        38817|        Carlos Tevez|            88.75|
|        32118|      Lukas Podolski|            88.25|
|        31921|         Gareth Bale|             87.0|
|        30834|        Arjen Robben|            86.75|
|       303824|       Memphis Depay|85.10714285714286|
|       129944|          Marco Reus|             85.0|
|       158263|        Dorlan Pabon|            84.75|
|       150565|Pierre-Emerick Au...|            84.75|
|        25759|     Gonzalo Higuain|84.66666666666667|
|       156726|       Douglas Costa|             84.5|
|       169193| Alexandre Lacazette|          84.4375|
|       286119|         Jamie Vardy|84.42857142857143|
|        30348|       Jermain Defoe|           84.375|
|        4

## Find out whether height is related to players' heading accuracy

#### 1. Join heading accuracy with player height to create a DataFrame

In [91]:
# Row counts of the attributes table and the player table before joining them.
(pa_df.count(), player_df.count())

(183978, 11060)

In [93]:
# Attach each player's height to every heading-accuracy row; the player
# table (the smaller input) is marked for broadcast.
player_heading_acc = (
    pa_df
    .select('player_api_id', 'heading_accuracy')
    .join(broadcast(player_df.select('player_api_id', 'height')), on='player_api_id')
)

In [128]:
# Discard rows that contain a null in any column, then preview the result.
player_heading_acc = player_heading_acc.na.drop()
player_heading_acc.show(n=5)

+-------------+----------------+------+
|player_api_id|heading_accuracy|height|
+-------------+----------------+------+
|       505942|              71|182.88|
|       505942|              71|182.88|
|       505942|              71|182.88|
|       505942|              70|182.88|
|       505942|              70|182.88|
+-------------+----------------+------+
only showing top 5 rows



## Accumulators

### Height

In [129]:
# One accumulator per height bucket, each starting at zero.
short_count, medium_low_count, medium_high_count, high_count = (
    spark.sparkContext.accumulator(0) for _ in range(4)
)

In [130]:
def count_players_by_height(r):
    """Add one to the accumulator of the height bucket that row ``r`` falls into.

    The bucket is chosen from ``float(r.height)``: below 175 goes to
    ``short_count``, below 183 to ``medium_low_count``, below 195 to
    ``medium_high_count`` and everything else to ``high_count``. The
    accumulators are looked up as module-level names; nothing is returned.
    """
    player_height = float(r.height)
    if player_height < 175:
        bucket = short_count
    elif player_height < 183:
        bucket = medium_low_count
    elif player_height < 195:
        bucket = medium_high_count
    else:
        bucket = high_count
    bucket.add(1)

In [131]:
# Zero the height-bucket accumulators on the driver first, so that re-running
# this cell does not add a second pass on top of the previous totals; then
# tally every row into its height bucket.
for _acc in (short_count, medium_low_count, medium_high_count, high_count):
    _acc.value = 0
player_heading_acc.foreach(count_players_by_height)

In [132]:
# Row totals per height bucket, from the shortest bucket to the tallest.
tuple(
    acc.value
    for acc in (short_count, medium_low_count, medium_high_count, high_count)
)

(19128, 98541, 62081, 3392)

### Heading accuracy

In [133]:
# One accumulator per height bucket for rows that pass the heading-accuracy
# threshold, each starting at zero.
short_ha_count, medium_low_ha_count, medium_high_ha_count, high_ha_count = (
    spark.sparkContext.accumulator(0) for _ in range(4)
)

In [134]:
def count_player_acc_count_by_height(r, threshold):
    """Count row ``r`` in its height bucket unless its heading accuracy is below ``threshold``.

    ``float(r.heading_accuracy)`` is compared with ``threshold``; rows strictly
    below it are ignored. Otherwise the bucket is chosen from ``float(r.height)``:
    below 175 goes to ``short_ha_count``, below 183 to ``medium_low_ha_count``,
    below 195 to ``medium_high_ha_count`` and everything else to
    ``high_ha_count``. The accumulators are looked up as module-level names;
    nothing is returned.
    """
    player_height = float(r.height)
    heading_acc = float(r.heading_accuracy)

    # Rows under the threshold contribute to no bucket.
    if heading_acc < threshold:
        return

    if player_height < 175:
        bucket = short_ha_count
    elif player_height < 183:
        bucket = medium_low_ha_count
    elif player_height < 195:
        bucket = medium_high_ha_count
    else:
        bucket = high_ha_count
    bucket.add(1)

In [135]:
# Count, per height bucket, the rows whose heading accuracy is at least 60
# (rows strictly below the threshold are skipped by the counting function).
# The accumulators are zeroed on the driver first so that re-running this
# cell does not add a second pass on top of the previous totals.
for _acc in (short_ha_count, medium_low_ha_count, medium_high_ha_count, high_ha_count):
    _acc.value = 0
player_heading_acc.foreach(lambda x: count_player_acc_count_by_height(x, 60))

In [137]:
# Rows that met the heading-accuracy threshold, per height bucket (shortest first).
tuple(
    acc.value
    for acc in (short_ha_count, medium_low_ha_count, medium_high_ha_count, high_ha_count)
)

(4290, 46086, 41785, 1580)

In [138]:
# Percentage of rows in each height bucket that met the heading-accuracy
# threshold, from the shortest bucket to the tallest.
[
    hits.value / total.value * 100
    for hits, total in zip(
        (short_ha_count, medium_low_ha_count, medium_high_ha_count, high_ha_count),
        (short_count, medium_low_count, medium_high_count, high_count),
    )
]

[22.42785445420326, 46.76835022985356, 67.30722765419371, 46.58018867924528]