In [6]:
import pyspark

from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions

In [7]:
# Reuse the already-active SparkSession if there is one, otherwise build a new one.
spark = SparkSession.builder.getOrCreate()
# SparkContext of that session; the cells below use it for the RDD API (sc.textFile).
sc = spark.sparkContext

In [8]:
# Input text files, read with sc.textFile in the cells below (relative paths).
carModelsPath = 'data/CarModels.txt'
salesEUPath = 'data/SalesEU.txt'
salesExtraEUPath = 'data/SalesExtraEU.txt'


# Output locations passed to saveAsTextFile: output1 for Task 1, output2 for Task 2.
output1 = 'out1'
output2 = 'out2'

In [15]:
# Load the inputs as RDDs of raw text lines (one comma-separated record per line).
# Fields used later in this notebook:
#   car models: [0] modelId, [2] manufacturer (compared against 'FIAT')
#   sales:      [2] modelId, [3] date (year before the first '/'), [4] country, [5] price
modelsRDD = sc.textFile(carModelsPath)
euSalesRDD = sc.textFile(salesEUPath)

def modelSalesMap(x):
    """Map a record keyed by modelId to (modelId, 1) so sales can be counted.

    x is an indexable record whose first element is the modelId; every other
    element is ignored.
    """
    modelId = x[0]
    return modelId, 1

def modelPriceMap(x):
    """Map (modelId, price) to (modelId, (price, 1)).

    The value is a (running sum, running count) seed: summing the pairs
    element-wise per key gives what is needed to compute an average price.
    """
    return (x[0], (x[1], 1))

def keepModelPrice(line):
    """Parse one comma-separated sales line into (modelId, price).

    The modelId is taken from the third field and the price, converted to
    float, from the sixth field. Raises IndexError if the line has too few
    fields and ValueError if the price is not numeric.
    """
    columns = line.split(',')
    return columns[2], float(columns[5])

# Task 1 thresholds, named here so they are easy to find and tune.
# NOTE: the task asks for more than 1,000,000 sales; 960 is a reduced value
# kept for testing on the sample data.
minItalySales = 960
minAvgPrice = 50000

# Select vehicles manufactured by FIAT
# (modelId, 'FIAT')
# Each line is split once and the resulting fields are reused.
fiatModelsRDD = modelsRDD.map(lambda line: line.split(','))\
                            .filter(lambda fields: fields[2] == 'FIAT')\
                            .map(lambda fields: (fields[0], fields[2]))

# Select only sales in Italy
# (modelId, price)
italySalesRDD = euSalesRDD.filter(lambda line: line.split(',')[4] == 'Italy')\
                            .map(keepModelPrice)

# Select sales of modelIDs manufactured by FIAT
# (modelId, price)
# The join is used only as a filter; the 'FIAT' value is dropped afterwards.
# This RDD feeds two separate aggregations below, so it is cached to avoid
# evaluating the filter + join lineage once per branch.
italyFiatSalesRDD = italySalesRDD.join(fiatModelsRDD)\
                            .map(lambda x: (x[0], x[1][0]))\
                            .cache()

# Count the sales in Italy for each modelId and keep only the models whose
# number of sales exceeds minItalySales
# (modelId, n. of sales)
modelItalyHighSalesRDD = italyFiatSalesRDD.map(modelSalesMap)\
                                    .reduceByKey(lambda v1, v2: v1 + v2)\
                                    .filter(lambda x: x[1] > minItalySales)

# Compute the average price in Italy for each modelId and keep only the models
# whose average price exceeds minAvgPrice
# (modelId, (price, +1)) ----> (modelId, (sum, count)) ----> (modelId, avg_price)
modelItalyHighAvgPriceRDD = italyFiatSalesRDD.map(modelPriceMap)\
                                    .reduceByKey(lambda v1, v2: (v1[0] + v2[0], v1[1] + v2[1]))\
                                    .mapValues(lambda x: x[0] / x[1])\
                                    .filter(lambda x: x[1] > minAvgPrice)

# Join the 2 RDDs (inner join = models satisfying both conditions) and keep the keys only
fiatModelItalyHighSalesAndAvgPriceRDD = modelItalyHighSalesRDD.join(modelItalyHighAvgPriceRDD)\
                                                        .keys()

# NOTE: saveAsTextFile fails if the output directory already exists, so the
# previous output must be removed before re-running this cell.
fiatModelItalyHighSalesAndAvgPriceRDD.saveAsTextFile(output1)

In [None]:
# ____________________________________________________
# Task 2 v1
# ____________________________________________________

In [30]:
# Load the extra-EU sales as an RDD of raw text lines; it is parsed below with the
# same function as the EU sales (modelIdSalesPerYear).
extraEURDD = sc.textFile(salesExtraEUPath)

def modelIdSalesPerYear(line):
    """Parse one comma-separated sales line into ((modelId, year), 1).

    The modelId is the third field. The year is the integer found before the
    first '/' of the fourth field (the sale date). The constant 1 lets the
    pairs be summed per key to count sales per model and year.
    """
    columns = line.split(',')
    yearText = columns[3].split('/', 1)[0]
    return (columns[2], int(yearText)), 1

# For each of the 2 sales datasets, build an RDD of
# ((modelId, year), +1)
euSalesPerYearRDD = euSalesRDD.map(modelIdSalesPerYear)
extraEUSalesPerYearRDD = extraEURDD.map(modelIdSalesPerYear)

# Put the two RDDs together and sum the +1s to count, for each modelId and
# each year, the number of sales worldwide
# ((modelId, year), n. of sales)
globalSalesPerYear = (
    euSalesPerYearRDD
    .union(extraEUSalesPerYearRDD)
    .reduceByKey(lambda left, right: left + right)
)

def checkIncreasingSales(x):
    """Tell whether a model's sales strictly increase from each year to the next.

    Parameters
    ----------
    x : tuple
        (modelId, yearlySales), where yearlySales is an iterable of
        (year, n. of sales) pairs in any order.

    Returns
    -------
    bool
        True if, once ordered by year, the years are consecutive and every
        year has strictly more sales than the previous one. An empty or
        single-year history is considered increasing. False otherwise.

    Notes
    -----
    A gap between two years means the model had no sales in the missing
    year(s), so its sales did not keep increasing: the function returns False
    instead of failing an assertion, which would abort the whole Spark job
    and would be skipped entirely under ``python -O``.
    """
    # sorted() builds a new list: the caller's collection is left untouched
    # and any iterable (not only a list) is accepted.
    salesByYear = sorted(x[1])

    lastYear = None
    lastSales = None
    for year, sales in salesByYear:
        if lastYear is not None:
            # Missing (or repeated) year: the sequence is not a year-by-year increase.
            if year != lastYear + 1:
                return False
            # Equal sales do not count as an increase.
            if sales <= lastSales:
                return False
        lastYear, lastSales = year, sales
    return True

# Re-key each ((modelId, year), n. of sales) as (modelId, (year, n. of sales)).
# Group by key to collect, for every modelId, one (year, n. of sales) entry per year
# (a list of 50 elements is expected), then keep only the models for which
# checkIncreasingSales reports an always-increasing number of sales.
# We assume the number of years is small enough for each model's entries to be
# stored and analyzed in one single list.
modelIdsWithIncreasingSales = (
    globalSalesPerYear
    .map(lambda entry: (entry[0][0], (entry[0][1], entry[1])))
    .groupByKey()
    .mapValues(list)
    .filter(checkIncreasingSales)
)
modelIdsWithIncreasingSales.keys().saveAsTextFile(output2)