# RDD-vs-DataFrame

In [1]:
# Making sure to link pyspark to the right Spark folder with findspark.
# findspark.init() has to run BEFORE the first pyspark import: it is what makes
# the Spark installation under /opt/spark importable, so calling it afterwards
# has no influence on which pyspark the imports below pick up.
import findspark

findspark.init('/opt/spark')

import time
from functools import wraps

from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count

In [2]:
# Application-level Spark configuration; the app name is what shows up in the Spark UI.
conf = SparkConf().setAppName("rdd-vs-dataframe")
# getOrCreate() returns the already-running SparkContext when this cell is re-run,
# whereas calling the SparkContext constructor a second time raises an error.
sc = SparkContext.getOrCreate(conf=conf)

In [3]:
! hadoop fs -put ../datasets/f1

23/08/02 00:09:39 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# HDFS URI of results.csv inside the f1 dataset directory (namenode: node-master:9000).
F1_RESULTS_PATH = "hdfs://node-master:9000/user/root/f1/results.csv"

In [5]:
# HDFS URI of drivers.csv inside the f1 dataset directory (namenode: node-master:9000).
F1_DRIVERS_PATH = "hdfs://node-master:9000/user/root/f1/drivers.csv"

## RDD

In [6]:
# Read results.csv as an RDD in which every element is one line of the file (a str).
res_lines = sc.textFile(F1_RESULTS_PATH)

In [8]:
# Preview only the first lines (header + 14 rows). collect() would ship the
# whole file to the driver and dump it into the notebook output.
res_lines.take(15)

['resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId',
 '1,18,1,1,22,1,1,1,1,10,58,34:50.6,5690616,39,2,01:27.5,218.3,1',
 '2,18,2,2,3,5,2,2,2,8,58,5.478,5696094,41,3,01:27.7,217.586,1',
 '3,18,3,3,7,7,3,3,3,6,58,8.163,5698779,41,5,01:28.1,216.719,1',
 '4,18,4,4,5,11,4,4,4,5,58,17.181,5707797,58,7,01:28.6,215.464,1',
 '5,18,5,1,23,3,5,5,5,4,58,18.014,5708630,43,1,01:27.4,218.385,1',
 '6,18,6,3,8,13,6,6,6,3,57,,,50,14,01:29.6,212.974,11',
 '7,18,7,5,14,17,7,7,7,2,55,,,22,12,01:29.5,213.224,5',
 '8,18,8,6,1,15,8,8,8,1,53,,,20,4,01:27.9,217.18,5',
 '9,18,9,2,4,2,,R,9,0,47,,,15,9,01:28.8,215.1,4',
 '10,18,10,7,12,18,,R,10,0,43,,,23,13,01:29.6,213.166,3',
 '11,18,11,8,18,19,,R,11,0,32,,,24,15,01:30.9,210.038,7',
 '12,18,12,4,6,20,,R,12,0,30,,,20,16,01:31.4,208.907,8',
 '13,18,13,6,2,4,,R,13,0,29,,,23,6,01:28.2,216.51,5',
 '14,18,14,9,9,8,,R,14,0,25,,,21,11,01:29.5,213.3,

In [12]:
# take(n) is an action: it hands back a plain Python list, not another RDD.
type(res_lines.take(5))

list

In [22]:
# Index into the list returned by take(5): position 3 is the fourth line of the
# file, i.e. the third data row because the header occupies position 0.
res_lines.take(5)[3]

'3,18,3,3,7,7,3,3,3,6,58,8.163,5698779,41,5,01:28.1,216.719,1'

In [23]:
# The first line of the file is the CSV header; split it into a list of column names.
res_header = res_lines.take(1)[0].split(",")

In [24]:
# Show the parsed column names of results.csv.
res_header

['resultId',
 'raceId',
 'driverId',
 'constructorId',
 'number',
 'grid',
 'position',
 'positionText',
 'positionOrder',
 'points',
 'laps',
 'time',
 'milliseconds',
 'fastestLap',
 'rank',
 'fastestLapTime',
 'fastestLapSpeed',
 'statusId']

In [34]:
# Count in how many rows of results.csv each driverId appears.
res_header_line = ",".join(res_header)  # exact text of the header row, rebuilt once

drivers_num_races = (
    res_lines
    .filter(lambda row: row != res_header_line)                 # keep everything except the header row
    .map(lambda row: tuple(zip(res_header, row.split(","))))    # row -> ((column, value), ...) pairs
    .map(lambda result: (int(result[2][1]), 1))                 # third pair is driverId -> (driverId, 1)
    .countByKey()                                               # action: driverId -> number of occurrences
)

In [35]:
# Show the driverId -> row-count mapping returned by countByKey() (a defaultdict on the driver).
drivers_num_races

defaultdict(int,
            {1: 208,
             2: 184,
             3: 206,
             4: 293,
             5: 112,
             6: 36,
             7: 27,
             8: 273,
             9: 76,
             10: 95,
             11: 91,
             12: 28,
             13: 271,
             14: 247,
             15: 256,
             16: 128,
             17: 217,
             18: 309,
             19: 24,
             20: 199,
             21: 231,
             22: 326,
             23: 180,
             24: 81,
             25: 69,
             26: 28,
             27: 46,
             28: 1,
             29: 21,
             30: 308,
             31: 95,
             32: 51,
             33: 37,
             34: 4,
             35: 165,
             36: 7,
             37: 107,
             38: 11,
             39: 48,
             40: 11,
             41: 38,
             42: 20,
             43: 28,
             44: 158,
             45: 14,
             46: 18,
         

In [None]:
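# Shape reminder: (key, val) pairs vs. a plain row tuple such as (1, "Joao", "Pinheiro", 34, ..)
# After labeling, each row is a tuple of pairs: (("resultId", "473"), ("raceId", "68"), ("driverId", "55"), ...)
# int(result[2][1]) => takes the third pair, ("driverId", "55"), and converts its value => 55
# So every row is mapped to a (driverId, 1) pair:
# (55, 1)
# (55, 1)
# (66, 1)
# (89, 1)
# ...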

In [None]:
# lambda line: line + 1  <=>  def inc(line): return line + 1

In [37]:
# Order the (driverId, count) pairs by count, largest first, and keep the first ten.
top_10_drivers_most_races = sorted(
    drivers_num_races.items(),
    key=lambda pair: pair[1],
    reverse=True,
)[:10]

In [38]:
# Show the ten (driverId, count) pairs with the highest counts.
top_10_drivers_most_races

[(22, 326),
 (18, 309),
 (30, 308),
 (4, 293),
 (8, 273),
 (13, 271),
 (119, 257),
 (15, 256),
 (14, 247),
 (21, 231)]

In [39]:
# Read drivers.csv as an RDD in which every element is one line of the file (a str).
drivers_lines = sc.textFile(F1_DRIVERS_PATH)

In [40]:
# The first line of the file is the CSV header; split it into a list of column names.
drivers_header = drivers_lines.take(1)[0].split(",")

In [41]:
# Show the parsed column names of drivers.csv.
drivers_header

['driverId',
 'driverRef',
 'number',
 'code',
 'forename',
 'surname',
 'dob',
 'nationality',
 'url']

In [42]:
# Build the list of (driverId, driverRef) pairs used to turn ids into readable names.
drivers_header_line = ",".join(drivers_header)  # exact text of the header row, rebuilt once

translate_driver_id = (
    drivers_lines
    .filter(lambda row: row != drivers_header_line)                 # keep everything except the header row
    .map(lambda row: tuple(zip(drivers_header, row.split(","))))    # row -> ((column, value), ...) pairs
    .map(lambda driver: (int(driver[0][1]), driver[1][1]))          # first two pairs -> (driverId, driverRef)
    .collect()                                                      # action: bring the pairs to the driver as a list
)

In [43]:
# Show the collected (driverId, driverRef) pairs.
translate_driver_id

[(1, 'hamilton'),
 (2, 'heidfeld'),
 (3, 'rosberg'),
 (4, 'alonso'),
 (5, 'kovalainen'),
 (6, 'nakajima'),
 (7, 'bourdais'),
 (8, 'raikkonen'),
 (9, 'kubica'),
 (10, 'glock'),
 (11, 'sato'),
 (12, 'piquet_jr'),
 (13, 'massa'),
 (14, 'coulthard'),
 (15, 'trulli'),
 (16, 'sutil'),
 (17, 'webber'),
 (18, 'button'),
 (19, 'davidson'),
 (20, 'vettel'),
 (21, 'fisichella'),
 (22, 'barrichello'),
 (23, 'ralf_schumacher'),
 (24, 'liuzzi'),
 (25, 'wurz'),
 (26, 'speed'),
 (27, 'albers'),
 (28, 'markus_winkelhock'),
 (29, 'yamamoto'),
 (30, 'michael_schumacher'),
 (31, 'montoya'),
 (32, 'klien'),
 (33, 'monteiro'),
 (34, 'ide'),
 (35, 'villeneuve'),
 (36, 'montagny'),
 (37, 'rosa'),
 (38, 'doornbos'),
 (39, 'karthikeyan'),
 (40, 'friesacher'),
 (41, 'zonta'),
 (42, 'pizzonia'),
 (43, 'matta'),
 (44, 'panis'),
 (45, 'pantano'),
 (46, 'bruni'),
 (47, 'baumgartner'),
 (48, 'gene'),
 (49, 'frentzen'),
 (50, 'verstappen'),
 (51, 'wilson'),
 (52, 'firman'),
 (53, 'kiesa'),
 (54, 'burti'),
 (55, 'alesi

In [44]:
# Turn the list of (driverId, driverRef) pairs into a dict for lookup by driverId.
# NOTE: this rebinds the same name from a list to a dict.
translate_driver_id = dict(translate_driver_id)

In [45]:
# Print the top-10 ranking with the driver reference in place of the numeric id.
for driver_id, num_races in top_10_drivers_most_races:
    driver_ref = translate_driver_id[driver_id]
    print(f"{driver_ref} : {num_races}")

barrichello : 326
button : 309
michael_schumacher : 308
alonso : 293
raikkonen : 273
massa : 271
patrese : 257
trulli : 256
coulthard : 247
fisichella : 231


## DataFrame

In [46]:
# Obtain the session through the builder instead of the SparkSession constructor:
# getOrCreate() reuses the SparkContext that is already running and returns the
# existing session when this cell is executed again.
spark = SparkSession.builder.getOrCreate()

In [47]:
# Results as a DataFrame: CSV source, first line used for the column names.
dfr = spark.read.load(F1_RESULTS_PATH, format="csv", header="true")

In [51]:
# Peek at the first row. Every value comes back as a string because the reader
# was given neither a schema nor a schema-inference option.
dfr.head(1)

[Row(resultId='1', raceId='18', driverId='1', constructorId='1', number='22', grid='1', position='1', positionText='1', positionOrder='1', points='10', laps='58', time='34:50.6', milliseconds='5690616', fastestLap='39', rank='2', fastestLapTime='01:27.5', fastestLapSpeed='218.3', statusId='1')]

In [52]:
# Drivers as a DataFrame: CSV source, first line used for the column names.
dfd = spark.read.load(F1_DRIVERS_PATH, format="csv", header="true")

In [53]:
# Peek at the first row. Every value comes back as a string because the reader
# was given neither a schema nor a schema-inference option.
dfd.head(1)

[Row(driverId='1', driverRef='hamilton', number='44', code='HAM', forename='Lewis', surname='Hamilton', dob='07/01/1985', nationality='British', url='http://en.wikipedia.org/wiki/Lewis_Hamilton')]

In [54]:
# Same top-10 ranking as the RDD section, expressed with the DataFrame API.
# Step 1: attach the driver columns to every result row (inner join on driverId).
results_with_drivers = dfr.join(dfd, dfr.driverId == dfd.driverId, "inner")

# Step 2: one row per (driverId, driverRef) with the number of non-null raceId values.
races_per_driver = results_with_drivers.groupBy(dfr.driverId, dfd.driverRef).agg(
    count(dfr.raceId).alias("races")
)

# Step 3: highest counts first, keep ten rows, print them as a table.
races_per_driver.orderBy(col("races").desc()).limit(10).show()

+--------+------------------+-----+
|driverId|         driverRef|races|
+--------+------------------+-----+
|      22|       barrichello|  326|
|      18|            button|  309|
|      30|michael_schumacher|  308|
|       4|            alonso|  293|
|       8|         raikkonen|  273|
|      13|             massa|  271|
|     119|           patrese|  257|
|      15|            trulli|  256|
|      14|         coulthard|  247|
|      21|        fisichella|  231|
+--------+------------------+-----+



In [56]:
# Expose the results DataFrame to Spark SQL under the name "races".
# createOrReplaceTempView replaces the deprecated registerTempTable and can be
# re-run safely because it overwrites an existing view of the same name.
dfr.createOrReplaceTempView("races")

In [57]:
# Expose the drivers DataFrame to Spark SQL under the name "drivers".
# createOrReplaceTempView replaces the deprecated registerTempTable and can be
# re-run safely because it overwrites an existing view of the same name.
dfd.createOrReplaceTempView("drivers")

In [59]:
# Same top-10 ranking written in SQL against the temp tables "races" and "drivers".
top_10_query = """
select r.driverId, d.driverRef, count(0) races
from races r
  inner join drivers d on r.driverId = d.driverId
group by r.driverId, d.driverRef
order by races desc
limit 10
"""
spark.sql(top_10_query).show()

+--------+------------------+-----+
|driverId|         driverRef|races|
+--------+------------------+-----+
|      22|       barrichello|  326|
|      18|            button|  309|
|      30|michael_schumacher|  308|
|       4|            alonso|  293|
|       8|         raikkonen|  273|
|      13|             massa|  271|
|     119|           patrese|  257|
|      15|            trulli|  256|
|      14|         coulthard|  247|
|      21|        fisichella|  231|
+--------+------------------+-----+

