In [117]:
import itertools
from pyspark import SparkContext

In [118]:
sc = SparkContext(master='local', appName="Assignment1_E1")

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=Assignment1_E1, master=local) created by __init__ at /var/folders/_l/6yn52l4165j2hbxlxj7ct8640000gn/T/ipykernel_47153/3017175912.py:1 

In [None]:
#hdfs dfs -mkdir -p data
#hdfs dfs -put data/small_conditions.csv data/

In [119]:
SUPPORT_THRESHOLD = 20
data = sc.textFile("data/small_conditions.csv")
header = data.first() #extract header

In [120]:
# START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION
# PATIENT is the patient identifier
# CODE is a condition identifier 
# DESCRIPTION is the name of the condition

In [121]:
# Reorganizing the data to a list of baskets (lists)
# and removing the header row
item_baskets = data.filter(lambda row: row != header)\
    .map(lambda line: tuple(line.split(",")))

item_baskets.take(3)

[('2017-01-14',
  '2017-03-30',
  '09e4e8cb-29c2-4ef4-86c0-a6ff0ba25d2a',
  '88e540ab-a7d7-47de-93c1-720a06f3d601',
  '65363002',
  'Otitis media'),
 ('2012-09-15',
  '2012-09-16',
  'b0a03e8c-8d0f-4242-9548-40f4d294eba8',
  'e89414dc-d0c6-478f-86c0-d08bac6ad0a2',
  '241929008',
  'Acute allergic reaction'),
 ('2018-06-17',
  '2018-06-24',
  '09e4e8cb-29c2-4ef4-86c0-a6ff0ba25d2a',
  'c14325b0-f7ec-4314-bba8-dddc37f0067d',
  '444814009',
  'Viral sinusitis (disorder)')]

## Apriori Phase 1

In [122]:
# Flat listing all the baskets 
items = item_baskets.flatMap(lambda x: x)

# Mapping -> create pairs (item, 1)
itemPairs = items.map(lambda item: (item, 1))

# Reducing
itemCounts = itemPairs.reduceByKey(lambda a, b: a + b)

# Keeping only the ones above the support threshold
freqItemCounts = itemCounts.filter(lambda item: item[1] >= SUPPORT_THRESHOLD)

#items.take(10)
#itemPairs.take(10)
#itemCounts.take(10)

# Taking the 10 most frequent itemsets for k = 1
freqItemCounts.takeOrdered(10, key=lambda x: -x[1])

[('', 783),
 ('444814009', 320),
 ('Viral sinusitis (disorder)', 320),
 ('195662009', 193),
 ('Acute viral pharyngitis (disorder)', 193),
 ('10509002', 143),
 ('Acute bronchitis (disorder)', 143),
 ('72892002', 142),
 ('Normal pregnancy', 142),
 ('162864005', 90)]

## Intermediate step

In [123]:
# Creating the frequent items table
freqItemTable = freqItemCounts.map(lambda x: x[0])

freqItemTable.take(10)

['65363002',
 'Otitis media',
 '444814009',
 'Viral sinusitis (disorder)',
 '10509002',
 'Acute bronchitis (disorder)',
 '',
 '195662009',
 'Acute viral pharyngitis (disorder)',
 '72892002']

## Phase 2, k = 2

In [124]:
# Receives as input: the baskets and the frequent items table
# Returns: candidate frequent pairs
def freq_pairs(basket,table):
    for item_1 in range(0, len(basket)):
        if basket[item_1] not in table:
            continue
        for item_2 in range(item_1 + 1, len(basket)): # j > i 
            if basket[item_2] in table:
                yield(tuple(sorted((basket[item_1], basket[item_2]))), 1)


In [125]:
# Convert to a list to avoid problems due to passing a rdd to another rdd
table = freqItemTable.collect()

# counting pairs of frequent items
pairs = item_baskets.flatMap(lambda x: freq_pairs(x, table)) \
                    .reduceByKey(lambda v1, v2: v1 + v2) \
                    .filter(lambda x: x[1] >= SUPPORT_THRESHOLD) \
                    .sortBy(lambda x: x[1], ascending=False)
                    
                    
####### PERGUNTAR se fazer desta maneira é a mesma coisa que
# fazer todas as combinações e retirar os duplicados pq os valores são os mesmos

pairs.take(10)

[(('444814009', 'Viral sinusitis (disorder)'), 320),
 (('195662009', 'Acute viral pharyngitis (disorder)'), 193),
 (('10509002', 'Acute bronchitis (disorder)'), 143),
 (('72892002', 'Normal pregnancy'), 142),
 (('', '162864005'), 90),
 (('', 'Body mass index 30+ - obesity (finding)'), 90),
 (('162864005', 'Body mass index 30+ - obesity (finding)'), 90),
 (('271737000', 'Anemia (disorder)'), 81),
 (('', '15777000'), 76),
 (('', 'Prediabetes'), 76)]

## Phase 2, k = 3

In [126]:
# Receives as input: the baskets, frequent items and frequent pairs
# Returns: candidate frequent triples
def freq_triples(basket, table, fqt_pairs):
    for item_1 in range(0, len(basket)):
        if basket[item_1] not in table:
            continue
        for item_2 in range(item_1 + 1, len(basket)):  # j > i
            if basket[item_2] not in table:
                continue
                        
            pair = tuple(sorted((basket[item_1], basket[item_2])))
            if pair not in fqt_pairs:
                continue
            
            for item_3 in range(item_2 + 1, len(basket)):
                if basket[item_3] not in table:
                    continue
                
                candidate_pairs = list(itertools.combinations((item_1, item_2, item_3), 2))

                # if all candidate pairs are frequent pairs yield the candidate triple
                if all(candidate_pair in fqt_pairs for candidate_pair in candidate_pairs):
                    continue
                    
                yield(tuple(sorted((basket[item_1], basket[item_2], basket[item_3]))), 1)


In [127]:
frequent_pairs = pairs.map(lambda x: x[0]).collect()

triples = item_baskets.flatMap(lambda x: freq_triples(x, table, frequent_pairs)) \
                    .reduceByKey(lambda v1, v2: v1 + v2) \
                    .filter(lambda x: x[1] >= SUPPORT_THRESHOLD) \
                    .sortBy(lambda x: x[1], ascending=False)

triples.collect()

[(('', '162864005', 'Body mass index 30+ - obesity (finding)'), 90),
 (('', '15777000', 'Prediabetes'), 76),
 (('', '271737000', 'Anemia (disorder)'), 75),
 (('', '59621000', 'Hypertension'), 62),
 (('', '40055000', 'Chronic sinusitis (disorder)'), 53),
 (('', '19169002', 'Miscarriage in first trimester'), 40),
 (('', '55822004', 'Hyperlipidemia'), 24)]