In [None]:
import gc  # for removing rdds from memory
import itertools

from pyspark import SparkConf, SparkContext



In [None]:
# Reuse the already running SparkContext when this cell is executed again;
# constructing a second SparkContext in the same kernel raises an error.
sc = SparkContext.getOrCreate(
    conf=SparkConf().setMaster("local").setAppName("Assignment1_E1")
)

In [None]:
#hdfs dfs -mkdir -p data
#hdfs dfs -put data/small_conditions.csv data/

In [None]:
# Minimum count an itemset must reach (count >= threshold) to be kept as frequent
SUPPORT_THRESHOLD = 1000
# RDD with one element per line of the conditions file
data = sc.textFile("data/conditions.csv.gz")
header = data.first() # first line of the file, kept so the header row can be filtered out

In [None]:
# START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION
# PATIENT is the patient identifier
# CODE is a condition identifier 
# DESCRIPTION is the name of the condition

In [None]:
# Drop the header row and split every remaining CSV line on commas,
# giving one tuple of column values per condition record
lines = (
    data
    .filter(lambda row: row != header)
    .map(lambda row: tuple(row.split(",")))
)

lines.take(3)

In [None]:
# Freeing memory
# Drops the driver-side name `data` and runs a garbage-collection pass.
# NOTE(review): `lines` was derived from `data`, so this probably releases little — confirm.
del data
gc.collect()

In [None]:
# Lookup table: condition CODE (converted to int) -> DESCRIPTION,
# collected on the driver as a plain dict
conditions = (
    lines
    .map(lambda fields: (int(fields[4]), fields[5]))
    .distinct()
    .collectAsMap()
)

conditions

In [None]:
#conditions = dict(conditions)

In [None]:
# One basket per patient: group the condition codes (column 4) by PATIENT
# (column 2) into a set, then keep only the de-duplicated codes as a tuple.
# The baskets feed every later counting pass (single items, pairs, triples),
# so they are cached instead of being rebuilt from the input for each pass.
item_baskets = lines.map(lambda x: (x[2], {x[4]})) \
                    .reduceByKey(lambda a, b: a | b) \
                    .map(lambda x: tuple(x[1])) \
                    .cache()


item_baskets.take(10)

In [None]:
# Freeing memory
# Drops the driver-side name `lines` and runs a garbage-collection pass.
del lines
gc.collect()

## Apriori Phase 1

In [None]:
# Count of every single item across all baskets:
#   1. emit an (item, 1) pair for each item of each basket
#   2. sum the ones per item
#   3. keep only the items whose count reaches the support threshold
freqItemCounts = (
    item_baskets
    .flatMap(lambda basket: [(item, 1) for item in basket])
    .reduceByKey(lambda left, right: left + right)
    .filter(lambda item_count: item_count[1] >= SUPPORT_THRESHOLD)
)

# Taking the 10 most frequent itemsets for k = 1
freqItemCounts.takeOrdered(10, key=lambda item_count: -item_count[1])

## Intermediate step

In [None]:
# Creating the frequent items table.
# Stored as a set because it is only used for membership tests
# ("item in table") inside the candidate generators, where a list would be
# scanned linearly for every item of every basket.
freqItemTable = set(freqItemCounts.map(lambda x: x[0]).collect())

## Phase 2, k = 2

In [None]:
def freq_pairs(basket, table):
    """Generate the candidate pairs contained in one basket.

    Args:
        basket: sequence with the items of a single basket.
        table: container holding the frequent single items.

    Yields:
        ``(pair, 1)`` for every two frequent items of the basket, where
        ``pair`` is the two items as a sorted tuple.
    """
    # Only frequent items can take part in a candidate pair
    frequent_items = [item for item in basket if item in table]
    for first, second in itertools.combinations(frequent_items, 2):
        yield (tuple(sorted((first, second))), 1)


In [None]:
# freqItemTable is a plain Python collection (not an RDD), so it can be
# referenced from inside the flatMap closure without nesting RDDs.

# Count the candidate pairs of frequent items, keep those reaching the
# support threshold and order them from most to least frequent
pairs = (
    item_baskets
    .flatMap(lambda basket: freq_pairs(basket, freqItemTable))
    .reduceByKey(lambda left, right: left + right)
    .filter(lambda pair_count: pair_count[1] >= SUPPORT_THRESHOLD)
    .sortBy(lambda pair_count: pair_count[1], ascending=False)
)

# TODO(ask): is generating the pairs this way equivalent to generating all
# the combinations and removing the duplicates, given the values are the same?

pairs.take(10)

In [None]:
# Number of pairs whose count reached SUPPORT_THRESHOLD
pairs.count()

## Phase 2, k = 3

In [None]:
def freq_triples(basket, table, fqt_pairs):
    """Generate the candidate triples contained in one basket.

    A triple is a candidate only when its three items are frequent and each
    of its three 2-item subsets is a frequent pair: a triple cannot be
    frequent if any of its sub-pairs is not.

    Args:
        basket: sequence with the items of a single basket.
        table: container holding the frequent single items.
        fqt_pairs: container holding the frequent pairs, each one stored as a
            sorted 2-tuple of items (the key format yielded by freq_pairs).
            A set is recommended because it is probed many times per basket.

    Yields:
        ``(triple, 1)`` for every candidate triple, where ``triple`` is the
        three items as a sorted tuple.
    """
    # Only frequent items can take part in a candidate triple
    frequent_items = [item for item in basket if item in table]
    size = len(frequent_items)

    for i in range(size):
        for j in range(i + 1, size):
            pair = tuple(sorted((frequent_items[i], frequent_items[j])))
            # Prune early: no triple containing an infrequent pair can be frequent
            if pair not in fqt_pairs:
                continue

            for k in range(j + 1, size):
                triple = tuple(sorted((frequent_items[i], frequent_items[j], frequent_items[k])))
                # The triple is sorted, so its 2-combinations come out as sorted
                # tuples and match the key format used in fqt_pairs
                sub_pairs = itertools.combinations(triple, 2)
                if all(sub_pair in fqt_pairs for sub_pair in sub_pairs):
                    yield (triple, 1)


In [None]:
# Itemsets of the frequent pairs, one (item, item) tuple per pair.
# Each pair is kept whole (not flattened into single items) because
# freq_triples looks complete pair tuples up in this collection.
frequent_pairs = set(pairs.map(lambda x: x[0]).collect())

frequent_pairs

In [None]:
# Itemsets of the frequent pairs as a set of (item, item) tuples.
# Each pair is kept whole (not flattened into single items) because
# freq_triples looks complete pair tuples up in this collection.
frequent_pairs = set(pairs.map(lambda x: x[0]).collect())

# Count the candidate triples, keep those reaching the support threshold
# and order them from most to least frequent
triples = item_baskets.flatMap(lambda x: freq_triples(x, freqItemTable, frequent_pairs)) \
                    .reduceByKey(lambda v1, v2: v1 + v2) \
                    .filter(lambda x: x[1] >= SUPPORT_THRESHOLD) \
                    .sortBy(lambda x: x[1], ascending=False)

triples.take(10)

In [None]:
# Itemsets of the frequent triples, without their counts
frequent_triples = triples.keys().collect()
frequent_triples

## Mining Association Rules

In [None]:
# Gather the frequent itemsets of every size (k = 1, 2, 3) together with
# their counts into a single list of (itemset, count) entries
frequent_baskets = freqItemCounts.union(pairs).union(triples).collect()
frequent_baskets[:10]

In [None]:
# Mining for k = 2
# Count of every frequent itemset, keyed by the itemset
mapa = dict(frequent_baskets)

# Confidence of each rule between two items, keyed by (antecedent, consequent).
# Keying by the antecedent alone would keep a single rule per item, because an
# item that belongs to several frequent pairs would overwrite its own entry.
rules = {}
for (item_x, item_y), pair_count in pairs.collect():
    # Rule 1  X -> Y = #(X u Y) / #X
    rules[(item_x, item_y)] = pair_count / mapa[item_x]

    # Rule 2  Y -> X = #(X u Y) / #Y
    rules[(item_y, item_x)] = pair_count / mapa[item_y]


In [None]:
# TODO: mine the association rules for k = 3; this loop is still a placeholder
for triple in triples.collect():
    pass