# Pares RDD (Pair RDD)

In [1]:
# Importando arquivo csv e criando um RDD
carros = sc.textFile("data/carros.csv")

In [2]:
carros.take(5)

['MAKE,FUELTYPE,ASPIRE,DOORS,BODY,DRIVE,CYLINDERS,HP,RPM,MPG-CITY,MPG-HWY,PRICE',
 'subaru,gas,std,two,hatchback,fwd,four,69,4900,31,36,5118',
 'chevrolet,gas,std,two,hatchback,fwd,three,48,5100,47,53,5151',
 'mazda,gas,std,two,hatchback,fwd,four,68,5000,30,31,5195',
 'toyota,gas,std,two,hatchback,fwd,four,62,4800,35,39,5348']

In [3]:
# Criando um Pair RDD
carrosPairRDD = carros.map(lambda x: (x.split(",")[0], x.split(",")[7]))
carrosPairRDD.take(5)

[('MAKE', 'HP'),
 ('subaru', '69'),
 ('chevrolet', '48'),
 ('mazda', '68'),
 ('toyota', '62')]

In [4]:
# Removendo o cabeçalho
header = carrosPairRDD.first()
carrosPairRDD2 = carrosPairRDD.filter(lambda line: line != header)

In [5]:
# Encontra a média de HP por marca de carro e adiciona 1 a cada contagem
addOne = carrosPairRDD2.mapValues(lambda x: (x, 1))
addOne.collect()

[('subaru', ('69', 1)),
 ('chevrolet', ('48', 1)),
 ('mazda', ('68', 1)),
 ('toyota', ('62', 1)),
 ('mitsubishi', ('68', 1)),
 ('honda', ('60', 1)),
 ('nissan', ('69', 1)),
 ('dodge', ('68', 1)),
 ('plymouth', ('68', 1)),
 ('mazda', ('68', 1)),
 ('mitsubishi', ('68', 1)),
 ('dodge', ('68', 1)),
 ('plymouth', ('68', 1)),
 ('chevrolet', ('70', 1)),
 ('toyota', ('62', 1)),
 ('dodge', ('68', 1)),
 ('honda', ('58', 1)),
 ('toyota', ('62', 1)),
 ('honda', ('76', 1)),
 ('chevrolet', ('70', 1)),
 ('nissan', ('69', 1)),
 ('mitsubishi', ('68', 1)),
 ('dodge', ('68', 1)),
 ('plymouth', ('68', 1)),
 ('mazda', ('68', 1)),
 ('isuzu', ('78', 1)),
 ('mazda', ('68', 1)),
 ('nissan', ('69', 1)),
 ('honda', ('76', 1)),
 ('toyota', ('62', 1)),
 ('toyota', ('70', 1)),
 ('mitsubishi', ('88', 1)),
 ('subaru', ('73', 1)),
 ('nissan', ('55', 1)),
 ('subaru', ('82', 1)),
 ('honda', ('76', 1)),
 ('toyota', ('70', 1)),
 ('honda', ('76', 1)),
 ('honda', ('76', 1)),
 ('nissan', ('69', 1)),
 ('nissan', ('69', 1)),
 

In [6]:
# Aplicando redução por key (reduceByKey) e conta a quantidade de fabricantes
# de carro, a média de HP e o nome do fabricante
fabricantes = addOne.reduceByKey(lambda x, y: (int(y[0]), x[1] + y[1]))
fabricantes.collect()

[('chevrolet', (70, 3)),
 ('mazda', (72, 16)),
 ('mitsubishi', (145, 13)),
 ('nissan', (200, 18)),
 ('dodge', (145, 8)),
 ('plymouth', (145, 7)),
 ('saab', (160, 6)),
 ('volvo', (114, 11)),
 ('alfa-romero', (154, 3)),
 ('mercedes-benz', (184, 8)),
 ('jaguar', (262, 3)),
 ('subaru', (111, 12)),
 ('toyota', (116, 32)),
 ('honda', (101, 13)),
 ('isuzu', (90, 2)),
 ('volkswagen', (68, 12)),
 ('peugot', (142, 11)),
 ('audi', (140, 6)),
 ('bmw', (182, 8)),
 ('mercury', ('175', 1)),
 ('porsche', (207, 4))]

In [7]:
# Calculando a média de HP dividindo pela contagem total
fabricantes.mapValues(lambda x: int(x[0]) / int(x[1])).collect()

[('chevrolet', 23.333333333333332),
 ('mazda', 4.5),
 ('mitsubishi', 11.153846153846153),
 ('nissan', 11.11111111111111),
 ('dodge', 18.125),
 ('plymouth', 20.714285714285715),
 ('saab', 26.666666666666668),
 ('volvo', 10.363636363636363),
 ('alfa-romero', 51.333333333333336),
 ('mercedes-benz', 23.0),
 ('jaguar', 87.33333333333333),
 ('subaru', 9.25),
 ('toyota', 3.625),
 ('honda', 7.769230769230769),
 ('isuzu', 45.0),
 ('volkswagen', 5.666666666666667),
 ('peugot', 12.909090909090908),
 ('audi', 23.333333333333332),
 ('bmw', 22.75),
 ('mercury', 175.0),
 ('porsche', 51.75)]

# Accumulators e Broadcast

In [8]:
# Inicializando variáveis Accumulator
sedanCount = sc.accumulator(0)
hatchbackCount = sc.accumulator(0)

In [10]:
# Inicializando variáveis Broadcast
sedanText = sc.broadcast("sedan")
hatchbackText = sc.broadcast("hatchback")

In [11]:
def splitLines(line):
    
    global sedanCount
    global hatchbackCount
    
    # Usa a variavel broadcast para comparar e configurar o accumulator
    if sedanText.value in line:
        sedanCount += 1
    if hatchbackText.value in line:
        hatchbackCount += 1
        
    return line.split(",")

In [13]:
# map()
splitData = carros.map(splitLines)

In [14]:
# Ação para executar a transformação (lazy evaluation)
splitData.count()
print(sedanCount, hatchbackCount)

92 67


# Partições

In [15]:
fabricantes.getNumPartitions()

2

In [16]:
# Especificando o número de partiçõees
collData = sc.parallelize([3, 5, 4, 7, 4],3)
collData.cache()
collData.count()

5

In [17]:
collData.getNumPartitions()

3

In [18]:
print(sc.defaultParallelism)

2
