In [None]:
#from pyspark import SparkContext
#from pyspark.sql import SparkSession

In [None]:
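# NOTE: `sc` is used by the cells below, so it must already exist in the session
# (e.g. provided by the kernel). If it does not, uncomment the import lines above
# and the two lines below to create it explicitly.
#spark = SparkSession.builder.appName('Exam20210705').getOrCreate()
#sc = spark.sparkContext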

In [None]:
# Input datasets (comma-separated text files).
# users_path is declared here but is not read by the code visible in this notebook.
items_path, ads_path, users_path = (
    '../exampleData/items.txt',
    '../exampleData/ads_sales.txt',
    '../exampleData/users.txt',
)

# Output locations, one per part of the exercise.
out1, out2 = 'out1', 'out2'

In [None]:
# load the two input files as RDDs of text lines (one element per line)
# `sc` is not defined in the visible cells: it must already exist in the session
items_rdd = sc.textFile(items_path)
ads_rdd = sc.textFile(ads_path)

In [None]:
### PART 1
# filter only the items that were actually purchased:
# keep the lines whose fourth comma-separated field is exactly the string 'true'
# the result is cached because it is reused in both PART 1 and PART 2
purchased_rdd = ads_rdd.filter(lambda line: line.split(',')[3] == 'true').cache()

In [None]:
def itemid_price(line):
    """Parse one comma-separated record into an (itemId, price) pair.

    The item id is taken from the third field as-is (a string); the price is
    the fifth field converted to float.
    """
    columns = line.split(',')
    return (columns[2], float(columns[4]))

# obtain the pair rdd
# (itemId, salePrice)
# one pair per purchased line: nothing is aggregated at this step
sales_price_rdd = purchased_rdd.map(itemid_price)

In [None]:
def itemid_recomPrice_category(line):
    """Parse one comma-separated item record.

    Returns (itemId, (recommendedPrice, category)), where itemId is the first
    field, recommendedPrice is the third field converted to float and
    category is the fourth field.
    """
    parts = line.split(',')
    item_key = parts[0]
    # the price is converted before the category is read, as in the original order
    details = (float(parts[2]), parts[3])
    return (item_key, details)

# obtain the pair rdd
# (itemId, (recommendedPrice, category))
# cached because it is used again in PART 2 to build (itemId, category)
items_category_recomPrice_rdd = items_rdd.map(itemid_recomPrice_category).cache()

In [None]:
# join the two pair rdds to obtain
# (itemId, (salePrice, (recommendedPrice, category)))
# inner join: purchased lines whose itemId is missing from the items file are dropped
# (one output element per purchased line, assuming itemIds are unique in the items file)
item_category_prices_rdd = sales_price_rdd.join(items_category_recomPrice_rdd)

In [None]:
def obtain_counters(item):
    """Convert one joined sale record into a pair of counters.

    Input:  (itemId, (salePrice, (recommendedPrice, category)))
    Output: ((itemId, category), (numerator, 1)) where numerator is 1 when the
    sale price is strictly greater than the recommended price, 0 otherwise.
    """
    item_id, joined = item[0], item[1]
    sold_at = joined[0]
    suggested, group = joined[1]

    # int(True) == 1 and int(False) == 0
    above_recommended = int(sold_at > suggested)

    return ((item_id, group), (above_recommended, 1))

# map each element into a tuple of counters and keep the category field (by moving it into the key):
# one counter for numerator (= number of times the item was sold at price > recommendedPrice)
# one counter for denominator (total number of times the item was sold)
# ((itemId, category), (numerator, denominator))
# at this stage every element describes a single sale, so denominator is always 1
item_counters_category_rdd = item_category_prices_rdd.map(obtain_counters)

In [None]:
# Aggregate the per-sale counters of each (itemId, category) key ...
# ((itemId, category), (total numerator, total denominator))
counters_sum_rdd = item_counters_category_rdd.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)

# ... then divide to get the share of sales made above the recommended price
# (a ratio, not multiplied by 100)
# ((itemId, category), percentage)
percentage_rdd = counters_sum_rdd.mapValues(lambda totals: totals[0] / totals[1])

In [None]:
# get the result by keeping only the elements whose ratio is at least 0.9 (>= 90%)
# and returning only the key part, i.e. the (itemId, category) pairs
# NOTE(review): the comparison is inclusive (>=); confirm whether a strict "> 90%" is required
result = percentage_rdd.filter(lambda it: it[1] >= 0.9)\
.keys()

In [None]:
# store the selected (itemId, category) pairs as text under the out1 path
# NOTE(review): presumably this fails if the output path already exists - confirm before re-running
result.saveAsTextFile(out1)

In [None]:
### PART 2
# use an rdd computed in part 1 containing items with itemId, recommendedPrice and category
# and keep only itemId, category
# (itemId, (recommendedPrice, category))
# ->
# (itemId, category)
# this rdd still contains every item of the items file, whether it was purchased or not
category_per_item_rdd = items_category_recomPrice_rdd.map(lambda it: (it[0], it[1][1]))

In [None]:
def itemid_profit(line):
    """Extract (itemId, amount) from one comma-separated sale record.

    itemId is the third field (kept as a string) and amount is the fifth
    field converted to float.
    """
    values = line.split(',')
    item_key, amount = values[2], float(values[4])
    return item_key, amount

# map purchased_rdd into an rdd containing itemId and price
# Only the lines with purchased equal to true are considered (the others have Price equal to 0
# and hence are useless). For this reason we can use purchased_rdd instead of ads_rdd.
# The price of each single sale is what this solution sums up as the item's profit.
# (itemId, profitPerSingleItem)
profits_per_item_rdd = purchased_rdd.map(itemid_profit)

In [None]:
# compute the total profits for each item by summing the values of all its purchased lines
# (itemId, profit)
# items without any purchased line do not appear in this rdd at all
total_profits_rdd = profits_per_item_rdd.reduceByKey(lambda v1, v2: v1 + v2)

In [None]:
# Use left outer join to join the two rdds previously computed to obtain the following pairRDD
# (itemId, (category, profit))
# note that profit is not defined for all items: items without any purchased line get profit None,
# and those are the items treated as unadvertised in the following steps
# NOTE(review): an item that was advertised but never purchased also gets None here,
# because total_profits_rdd is built from purchased_rdd only - confirm this is intended
unadv_profits_rdd = category_per_item_rdd.leftOuterJoin(total_profits_rdd)

In [None]:
# keep only unadvertised and low-profit items:
# profit is None (no purchased line for the item) or total profit <= 100
# the None check comes first so that None is never compared with a number
lowprofits_unadv_items = unadv_profits_rdd.filter(lambda it: it[1][1] is None or it[1][1] <= 100)

In [None]:
def determine_low_profit_and_unadv(item, low_profit_threshold=100):
    """Turn one joined item record into per-category indicator counters.

    Args:
        item: pair (itemId, (category, profit)); profit is None when the item
            has no entry on the profit side of the left outer join.
        low_profit_threshold: inclusive upper bound below which a defined
            profit counts as low. Defaults to 100, the value that was
            previously hard-coded.

    Returns:
        (category, (x, y)) where x is 1 when profit is None (0 otherwise) and
        y is 1 when profit is defined and <= low_profit_threshold (0
        otherwise). x and y are never both 1.
    """
    category = item[1][0]
    profit = item[1][1]

    # A missing profit is handled first so that None is never compared with a number.
    if profit is None:
        return (category, (1, 0))

    low_profit = 1 if profit <= low_profit_threshold else 0
    return (category, (0, low_profit))

# map each element into a pair (category, (x, y))
# where:
# x = 1 if the item is unadvertised (profit is None), 0 otherwise
# y = 1 if the item is low-profit (profit defined and <= 100), 0 otherwise
# and count for each category the number of unadvertised and low-profits items
# by summing the two indicators element-wise
# (category, (# items unadvertised, # items with low-profits))
lowprofits_unadv_per_category = lowprofits_unadv_items.map(determine_low_profit_and_unadv)\
                                                .reduceByKey(lambda v1, v2: (v1[0] + v2[0], v1[1] + v2[1]))

In [None]:
# filter only those categories with at least 10 low-profit items
# and at least 10 unadvertised items
# (it[1][0] is the unadvertised counter, it[1][1] is the low-profit counter)
result2 = lowprofits_unadv_per_category.filter(lambda it: it[1][0] >= 10 and it[1][1] >= 10)
# for testing purposes, set thresholds to 2 instead of 10
#result2 = lowprofits_unadv_per_category.filter(lambda it: it[1][0] >= 2 and it[1][1] >= 2)

In [None]:
# store the selected (category, (# unadvertised, # low-profit)) elements as text under the out2 path
# NOTE(review): presumably this fails if the output path already exists - confirm before re-running
result2.saveAsTextFile(out2)