In [1]:
#Load the file. Lazy initialization
autoData = sc.textFile("data/auto-data.csv")
autoData.cache()
#Loads only now.
autoData.count()
autoData.first()
autoData.take(5)

[u'MAKE,FUELTYPE,ASPIRE,DOORS,BODY,DRIVE,CYLINDERS,HP,RPM,MPG-CITY,MPG-HWY,PRICE',
 u'subaru,gas,std,two,hatchback,fwd,four,69,4900,31,36,5118',
 u'chevrolet,gas,std,two,hatchback,fwd,three,48,5100,47,53,5151',
 u'mazda,gas,std,two,hatchback,fwd,four,68,5000,30,31,5195',
 u'toyota,gas,std,two,hatchback,fwd,four,62,4800,35,39,5348']

In [2]:
for line in autoData.collect():
    print line

MAKE,FUELTYPE,ASPIRE,DOORS,BODY,DRIVE,CYLINDERS,HP,RPM,MPG-CITY,MPG-HWY,PRICE
subaru,gas,std,two,hatchback,fwd,four,69,4900,31,36,5118
chevrolet,gas,std,two,hatchback,fwd,three,48,5100,47,53,5151
mazda,gas,std,two,hatchback,fwd,four,68,5000,30,31,5195
toyota,gas,std,two,hatchback,fwd,four,62,4800,35,39,5348
mitsubishi,gas,std,two,hatchback,fwd,four,68,5500,37,41,5389
honda,gas,std,two,hatchback,fwd,four,60,5500,38,42,5399
nissan,gas,std,two,sedan,fwd,four,69,5200,31,37,5499
dodge,gas,std,two,hatchback,fwd,four,68,5500,37,41,5572
plymouth,gas,std,two,hatchback,fwd,four,68,5500,37,41,5572
mazda,gas,std,two,hatchback,fwd,four,68,5000,31,38,6095
mitsubishi,gas,std,two,hatchback,fwd,four,68,5500,31,38,6189
dodge,gas,std,four,hatchback,fwd,four,68,5500,31,38,6229
plymouth,gas,std,four,hatchback,fwd,four,68,5500,31,38,6229
chevrolet,gas,std,two,hatchback,fwd,four,70,5400,38,43,6295
toyota,gas,std,two,hatchback,fwd,four,62,4800,31,38,6338
dodge,gas,std,two,hatchback,fwd,four,68,5500,31,38,6377

In [3]:
#............................................................................
##   Loading Data From a Collection
#............................................................................
collData=sc.parallelize([3,5,4,7,4])
collData.cache()
collData.count()

5

In [4]:
#............................................................................
##   Transformations
#............................................................................


In [5]:
#Map and create a new RDD
tsvData=autoData.map(lambda x : x.replace(",","\t"))
tsvData.take(5)

[u'MAKE\tFUELTYPE\tASPIRE\tDOORS\tBODY\tDRIVE\tCYLINDERS\tHP\tRPM\tMPG-CITY\tMPG-HWY\tPRICE',
 u'subaru\tgas\tstd\ttwo\thatchback\tfwd\tfour\t69\t4900\t31\t36\t5118',
 u'chevrolet\tgas\tstd\ttwo\thatchback\tfwd\tthree\t48\t5100\t47\t53\t5151',
 u'mazda\tgas\tstd\ttwo\thatchback\tfwd\tfour\t68\t5000\t30\t31\t5195',
 u'toyota\tgas\tstd\ttwo\thatchback\tfwd\tfour\t62\t4800\t35\t39\t5348']

In [6]:
#Filter and create a new RDD
toyotaData=autoData.filter(lambda x: "toyota" in x)
toyotaData.count()

32

In [7]:
#FlatMap
words=toyotaData.flatMap(lambda line: line.split(","))
words.take(20)

[u'toyota',
 u'gas',
 u'std',
 u'two',
 u'hatchback',
 u'fwd',
 u'four',
 u'62',
 u'4800',
 u'35',
 u'39',
 u'5348',
 u'toyota',
 u'gas',
 u'std',
 u'two',
 u'hatchback',
 u'fwd',
 u'four',
 u'62']

In [8]:
#Distinct
for numbData in collData.distinct().collect():
    print numbData

4
3
5
7


In [9]:
#Set operations
words1 = sc.parallelize(["hello","war","peace","world"])
words2 = sc.parallelize(["war","peace","universe"])

In [10]:
for unions in words1.union(words2).distinct().collect():
    print unions

world
universe
hello
war
peace


In [11]:
for intersects in words1.intersection(words2).collect():
    print intersects


war
peace


In [12]:
#............................................................................
##   Actions
#............................................................................

In [13]:
#reduce
collData.reduce(lambda x,y: x+y)

23

In [14]:
#find the shortest line
autoData.reduce(lambda x,y: x if len(x) < len(y) else y)

u'bmw,gas,std,two,sedan,rwd,six,182,5400,16,22,41315'

In [15]:
#Aggregations

In [16]:
#Perform the same work as reduce
seqOp = (lambda x, y: (x+y))
combOp = (lambda x, y: (x+y))
collData.aggregate((0), seqOp, combOp)

23

In [17]:
#Do addition and multiplication at the same time.
#X now becomes a tuple for sequence
seqOp = (lambda x, y: (x[0]+y, x[1]*y))
#both X and Y are tuples
combOp = (lambda x, y: (x[0]+y[0], x[1]*y[1]))
collData.aggregate((0,1), seqOp, combOp)

(23, 1680)

In [18]:
#............................................................................
##   Functions in Spark
#............................................................................

In [19]:
#cleanse and transform an RDD
def cleanseRDD(autoStr) :
    if isinstance(autoStr, int) :
        return autoStr
    attList=autoStr.split(",")
    #convert doors to a number
    if attList[3] == "two" :
         attList[3]="2"
    else :
         attList[3]="4"
    #Convert Drive to uppercase
    attList[5] = attList[5].upper()
    return ",".join(attList)

In [20]:
cleanedData=autoData.map(cleanseRDD)
cleanedData.collect()

[u'MAKE,FUELTYPE,ASPIRE,4,BODY,DRIVE,CYLINDERS,HP,RPM,MPG-CITY,MPG-HWY,PRICE',
 u'subaru,gas,std,2,hatchback,FWD,four,69,4900,31,36,5118',
 u'chevrolet,gas,std,2,hatchback,FWD,three,48,5100,47,53,5151',
 u'mazda,gas,std,2,hatchback,FWD,four,68,5000,30,31,5195',
 u'toyota,gas,std,2,hatchback,FWD,four,62,4800,35,39,5348',
 u'mitsubishi,gas,std,2,hatchback,FWD,four,68,5500,37,41,5389',
 u'honda,gas,std,2,hatchback,FWD,four,60,5500,38,42,5399',
 u'nissan,gas,std,2,sedan,FWD,four,69,5200,31,37,5499',
 u'dodge,gas,std,2,hatchback,FWD,four,68,5500,37,41,5572',
 u'plymouth,gas,std,2,hatchback,FWD,four,68,5500,37,41,5572',
 u'mazda,gas,std,2,hatchback,FWD,four,68,5000,31,38,6095',
 u'mitsubishi,gas,std,2,hatchback,FWD,four,68,5500,31,38,6189',
 u'dodge,gas,std,4,hatchback,FWD,four,68,5500,31,38,6229',
 u'plymouth,gas,std,4,hatchback,FWD,four,68,5500,31,38,6229',
 u'chevrolet,gas,std,2,hatchback,FWD,four,70,5400,38,43,6295',
 u'toyota,gas,std,2,hatchback,FWD,four,62,4800,31,38,6338',
 u'dodge,ga

In [21]:
#Sue a function to perform reduce 
def getMPG( autoStr) :
    if isinstance(autoStr, int) :
        return autoStr
    attList=autoStr.split(",")
    if attList[9].isdigit() :
        return int(attList[9])
    else:
        return 0

In [22]:
#find average MPG-City for all cars    
autoData.reduce(lambda x,y : getMPG(x) + getMPG(y)) \
    / (autoData.count()-1)

25

In [23]:
#............................................................................
##   Working with Key/Value RDDs
#............................................................................

In [24]:
#create a KV RDD of auto Brand and Horsepower
cylData = autoData.map( lambda x: ( x.split(",")[0], \
    x.split(",")[7]))
cylData.take(5)
cylData.keys().collect()

[u'MAKE',
 u'subaru',
 u'chevrolet',
 u'mazda',
 u'toyota',
 u'mitsubishi',
 u'honda',
 u'nissan',
 u'dodge',
 u'plymouth',
 u'mazda',
 u'mitsubishi',
 u'dodge',
 u'plymouth',
 u'chevrolet',
 u'toyota',
 u'dodge',
 u'honda',
 u'toyota',
 u'honda',
 u'chevrolet',
 u'nissan',
 u'mitsubishi',
 u'dodge',
 u'plymouth',
 u'mazda',
 u'isuzu',
 u'mazda',
 u'nissan',
 u'honda',
 u'toyota',
 u'toyota',
 u'mitsubishi',
 u'subaru',
 u'nissan',
 u'subaru',
 u'honda',
 u'toyota',
 u'honda',
 u'honda',
 u'nissan',
 u'nissan',
 u'mazda',
 u'subaru',
 u'nissan',
 u'subaru',
 u'dodge',
 u'plymouth',
 u'mitsubishi',
 u'toyota',
 u'subaru',
 u'volkswagen',
 u'toyota',
 u'nissan',
 u'honda',
 u'toyota',
 u'toyota',
 u'dodge',
 u'plymouth',
 u'volkswagen',
 u'volkswagen',
 u'nissan',
 u'subaru',
 u'toyota',
 u'mitsubishi',
 u'volkswagen',
 u'toyota',
 u'nissan',
 u'toyota',
 u'toyota',
 u'mazda',
 u'volkswagen',
 u'mitsubishi',
 u'toyota',
 u'honda',
 u'mazda',
 u'dodge',
 u'plymouth',
 u'toyota',
 u'nissan

In [25]:
#Remove header row
header = cylData.first()
cylHPData= cylData.filter(lambda line: line != header)

In [26]:
#Add a count 1 to each record and then reduce to find totals of HP and counts
brandValues=cylHPData.mapValues(lambda x: (x, 1)) \
    .reduceByKey(lambda x, y: (int(x[0]) + int(y[0]), \
    x[1] + y[1])) 
brandValues.collect()

[(u'dodge', (675, 8)),
 (u'mercury', (u'175', 1)),
 (u'jaguar', (614, 3)),
 (u'alfa-romero', (376, 3)),
 (u'nissan', (1846, 18)),
 (u'toyota', (2969, 32)),
 (u'plymouth', (607, 7)),
 (u'mazda', (1390, 16)),
 (u'subaru', (1035, 12)),
 (u'peugot', (1098, 11)),
 (u'porsche', (764, 4)),
 (u'isuzu', (168, 2)),
 (u'chevrolet', (188, 3)),
 (u'honda', (1043, 13)),
 (u'volvo', (1408, 11)),
 (u'bmw', (1111, 8)),
 (u'mercedes-benz', (1170, 8)),
 (u'mitsubishi', (1353, 13)),
 (u'saab', (760, 6)),
 (u'volkswagen', (973, 12)),
 (u'audi', (687, 6))]

In [27]:
#find average by dividing HP total by count total
brandValues.mapValues(lambda x: int(x[0])/int(x[1])). \
    collect()

[(u'dodge', 84),
 (u'mercury', 175),
 (u'jaguar', 204),
 (u'alfa-romero', 125),
 (u'nissan', 102),
 (u'toyota', 92),
 (u'plymouth', 86),
 (u'mazda', 86),
 (u'subaru', 86),
 (u'peugot', 99),
 (u'porsche', 191),
 (u'isuzu', 84),
 (u'chevrolet', 62),
 (u'honda', 80),
 (u'volvo', 128),
 (u'bmw', 138),
 (u'mercedes-benz', 146),
 (u'mitsubishi', 104),
 (u'saab', 126),
 (u'volkswagen', 81),
 (u'audi', 114)]

In [28]:
#............................................................................
##   Advanced Spark : Accumulators & Broadcast Variables
#............................................................................

In [29]:
#function that splits the line as well as counts sedans and hatchbacks
#Speed optimization

    
#Initialize accumulator
sedanCount = sc.accumulator(0)
hatchbackCount =sc.accumulator(0)

In [30]:
#Set Broadcast variable
sedanText=sc.broadcast("sedan")
hatchbackText=sc.broadcast("hatchback")

In [31]:
def splitLines(line) :

    global sedanCount
    global hatchbackCount

    #Use broadcast variable to do comparison and set accumulator
    if sedanText.value in line:
        sedanCount +=1
    if hatchbackText.value in line:
        hatchbackCount +=1
        
    return line.split(",")

In [32]:
#do the map
splitData=autoData.map(splitLines)

In [33]:
#Make it execute the map (lazy execution)
splitData.count()
print sedanCount, hatchbackCount

92 67


In [34]:
#............................................................................
##   Advanced Spark : Partitions
#............................................................................

In [35]:
collData.getNumPartitions()

#Specify no. of partitions.
collData=sc.parallelize([3,5,4,7,4],2)
collData.cache()
collData.count()

5

In [36]:
collData.getNumPartitions()

2