In [1]:
import itertools
import gc # for removing rdds from memory
from pyspark import SparkContext

In [2]:
sc = SparkContext(master='local', appName="Assignment1_E1")

22/04/24 12:34:58 WARN Utils: Your hostname, Luiss-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.0.126 instead (on interface en0)
22/04/24 12:34:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/24 12:34:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
#hdfs dfs -mkdir -p data
#hdfs dfs -put data/small_conditions.csv data/

In [4]:
SUPPORT_THRESHOLD = 1000
data = sc.textFile("data/conditions.csv.gz")
header = data.first() #extract header

                                                                                

In [5]:
# START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION
# PATIENT is the patient identifier
# CODE is a condition identifier 
# DESCRIPTION is the name of the condition

In [6]:
# Reorganizing the data to a list of baskets (lists)
# and removing the header row
lines = data.filter(lambda row: row != header) \
                .map(lambda line: tuple(line.split(",")))

lines.take(3)

                                                                                

[('2017-01-14',
  '2017-03-30',
  '09e4e8cb-29c2-4ef4-86c0-a6ff0ba25d2a',
  '88e540ab-a7d7-47de-93c1-720a06f3d601',
  '65363002',
  'Otitis media'),
 ('2012-09-15',
  '2012-09-16',
  'b0a03e8c-8d0f-4242-9548-40f4d294eba8',
  'e89414dc-d0c6-478f-86c0-d08bac6ad0a2',
  '241929008',
  'Acute allergic reaction'),
 ('2018-06-17',
  '2018-06-24',
  '09e4e8cb-29c2-4ef4-86c0-a6ff0ba25d2a',
  'c14325b0-f7ec-4314-bba8-dddc37f0067d',
  '444814009',
  'Viral sinusitis (disorder)')]

In [7]:
# Freeing memory
del data
gc.collect()

0

In [8]:
conditions = lines.map(lambda x: (int(x[4]), x[5])) \
                .distinct() \
                .collectAsMap()

#conditions

                                                                                

In [9]:
item_baskets = lines.map(lambda x: (x[2], {x[4]})) \
                    .reduceByKey(lambda a, b: a | b) \
                    .map(lambda x: tuple(x[1]))
                    

item_baskets.take(3)

                                                                                

[('444814009', '241929008', '70704007'),
 ('444814009', '40055000', '59621000'),
 ('195662009', '444814009', '16114001', '162864005')]

In [10]:
#item_baskets.count()

In [11]:
# Freeing memory
del lines
gc.collect()

238

## Apriori Phase 1

In [12]:
# Flat listing all the baskets 
freqItemCounts = item_baskets.flatMap(lambda x: x) \
                    .map(lambda item: (item, 1)) \
                    .reduceByKey(lambda a, b: a + b) \
                    .filter(lambda item: item[1] >= SUPPORT_THRESHOLD)
                    

# Mapping -> create pairs (item, 1)
#itemPairs = items.map(lambda item: (item, 1))

# Reducing
#itemCounts = itemPairs.reduceByKey(lambda a, b: a + b)

# Keeping only the ones above the support threshold
#freqItemCounts = itemCounts.filter(lambda item: item[1] >= SUPPORT_THRESHOLD)


# Taking the 10 most frequent itemsets for k = 1
#freqItemCounts.takeOrdered(10, key=lambda x: -x[1])

## Intermediate step

In [13]:
# Creating the frequent items table
freq_item_count = freqItemCounts.collect()

freqItemTable = freqItemCounts.map(lambda x: x[0]).collect()

#freqItemTable.take(10)

                                                                                

In [14]:
# In order for a pair to be frequent both its items have to be frequent. 
# As such, we can remove the unfrequent items from the baskets.

# Remove the unfrequent items from the baskets
item_baskets = item_baskets.filter(lambda basket: {item for item in basket if item in freqItemTable}) 
                            #.map(lambda x: tuple(x))    


#item_baskets.take(3)

## Phase 2, k = 2

In [15]:
# Generate all the possible pairs / triples / etc from the
# combinations of the items of each basket. 

def freq_n_uple(basket, k):
    candidate_n_uple = itertools.combinations(basket, k)
    for n_uple in candidate_n_uple:
        yield(n_uple, 1)

In [17]:
# Convert to a list to avoid problems due to passing a rdd to another rdd
#table = freqItemTable.collect()

# counting pairs of frequent items
pairs = item_baskets.flatMap(lambda x: freq_n_uple(x, 2)) \
                    .reduceByKey(lambda v1, v2: v1 + v2) \
                    .filter(lambda x: x[1] >= SUPPORT_THRESHOLD) \
                    .sortBy(lambda x: x[1], ascending=False)
                    
pairs.take(10)

                                                                                

[(('15777000', '271737000'), 289116),
 (('444814009', '195662009'), 265507),
 (('444814009', '162864005'), 240844),
 (('10509002', '444814009'), 238167),
 (('15777000', '444814009'), 222725),
 (('271737000', '444814009'), 218281),
 (('59621000', '444814009'), 174520),
 (('10509002', '195662009'), 167718),
 (('271737000', '195662009'), 152499),
 (('40055000', '444814009'), 150711)]

In [18]:
frequent_pairs_count = pairs.collect()

#frequent_pairs = pairs.map(lambda x: x[0]).collect()

#frequent_pairs

In [19]:
# Only worth checking for triples in items that are in frequent pairs.
# So we create a table similar to the frequent items table in order to remove
# unfrequent items from the baskets.

freq_pair_table = pairs.flatMap(lambda x: x[0]) \
                        .distinct() \
                        .collect()

#len(freq_pair_table)

In [20]:
# Removing unfrequent items from the baskets and droping baskets 
# with fewer than 3 items because we need ate least 3 items to make a triple.

item_baskets = item_baskets.filter(lambda basket: {item for item in basket if item in freq_pair_table}) \
                            .filter(lambda x: len(x) > 2 )
                            

#item_baskets.take(3)

## Phase 2, k = 3

In [21]:
triples = item_baskets.flatMap(lambda x: freq_n_uple(x, 3)) \
                    .reduceByKey(lambda v1, v2: v1 + v2) \
                    .filter(lambda x: x[1] >= SUPPORT_THRESHOLD) \
                    .sortBy(lambda x: x[1], ascending=False)

triples.take(10)

                                                                                

[(('15777000', '271737000', '444814009'), 177924),
 (('15777000', '271737000', '195662009'), 124307),
 (('10509002', '444814009', '195662009'), 112687),
 (('15777000', '444814009', '195662009'), 102564),
 (('271737000', '444814009', '195662009'), 100769),
 (('444814009', '162864005', '195662009'), 99780),
 (('15777000', '10509002', '271737000'), 95176),
 (('10509002', '444814009', '162864005'), 86641),
 (('15777000', '10509002', '444814009'), 85968),
 (('59621000', '444814009', '195662009'), 81822)]

In [22]:
frequent_triples = triples.map(lambda x: x[0]).collect()
#frequent_triples

                                                                                

[('15777000', '271737000', '444814009'),
 ('15777000', '271737000', '195662009'),
 ('10509002', '444814009', '195662009'),
 ('15777000', '444814009', '195662009'),
 ('271737000', '444814009', '195662009'),
 ('444814009', '162864005', '195662009'),
 ('15777000', '10509002', '271737000'),
 ('10509002', '444814009', '162864005'),
 ('15777000', '10509002', '444814009'),
 ('59621000', '444814009', '195662009'),
 ('15777000', '59621000', '271737000'),
 ('15777000', '271737000', '162864005'),
 ('15777000', '40055000', '271737000'),
 ('15777000', '59621000', '444814009'),
 ('10509002', '271737000', '444814009'),
 ('40055000', '444814009', '195662009'),
 ('15777000', '444814009', '162864005'),
 ('59621000', '444814009', '162864005'),
 ('15777000', '40055000', '444814009'),
 ('40055000', '444814009', '162864005'),
 ('271737000', '444814009', '162864005'),
 ('15777000', '10509002', '195662009'),
 ('10509002', '162864005', '195662009'),
 ('59621000', '271737000', '444814009'),
 ('19169002', '44481

In [23]:
# triples.count()

## Mining Association Rules

In [24]:
# joining all the frequent baskets in one list
frequent_baskets = (freqItemCounts + pairs + triples).collect()
frequent_baskets

                                                                                

[('10509002', 461495),
 ('59621000', 305134),
 ('233678006', 25755),
 ('367498001', 15308),
 ('84757009', 22352),
 ('128613002', 42693),
 ('195662009', 524692),
 ('703151001', 42693),
 ('65363002', 134018),
 ('263102004', 34193),
 ('19169002', 201894),
 ('55822004', 133442),
 ('444814009', 751940),
 ('162864005', 365567),
 ('124171000119105', 55010),
 ('15777000', 354315),
 ('26929004', 34746),
 ('40055000', 250239),
 ('271737000', 355372),
 ('47693006', 16103),
 ('74400008', 53933),
 ('428251008', 53933),
 ('44465007', 118138),
 ('72892002', 205390),
 ('254837009', 21558),
 ('82423001', 54974),
 ('88805009', 56461),
 ('68496003', 85587),
 ('403190006', 26363),
 ('79586000', 25783),
 ('39848009', 54950),
 ('16114001', 34085),
 ('43878008', 153069),
 ('58150001', 38234),
 ('30832001', 7126),
 ('201834006', 25426),
 ('398254007', 22959),
 ('198992004', 22738),
 ('283385000', 34720),
 ('109838007', 8153),
 ('713197008', 25903),
 ('35999006', 17873),
 ('239873007', 61000),
 ('62106007', 66

In [25]:
# Mining for k = 2
mapa = dict(frequent_baskets)

rules = {}
for pair in pairs.collect():
    
    
    print(mapa[pair[0][0]])
    # Rule 1  X -> Y = #(X u Y) / #X
    confidence = int(pair[1]) / mapa[pair[0][0]]
    rules[pair[0][0]] = (pair[0][1], confidence)
    
    print(mapa[pair[0][1]])
    # Rule 2  Y -> X = #(X u Y) / #Y
    confidence = int(pair[1]) / mapa[pair[0][1]]
    rules[pair[0][1]] = (pair[0][0], confidence)

#rules


354315
355372
751940
524692
751940
365567
461495
751940
354315
751940
355372
751940
305134
751940
461495
524692
355372
524692
250239
751940
354315
524692
365567
524692
354315
461495
461495
365567
201894
751940
305134
524692
354315
305134
461495
355372
250239
524692
354315
365567
354315
250239
305134
365567
355372
365567
305134
355372
250239
365567
751940
153069
201894
365567
250239
355372
201894
524692
461495
305134
201894
354315
133442
751940
205390
751940
201894
461495
205390
524692
524692
751940
201894
205390
461495
205390
751940
134018
201894
355372
133442
365567
118138
751940
133442
354315
74395
77306
250239
305134
133442
355372
751940
461495
201894
305134
77306
75992
74395
75992
461495
153069
355372
205390
133442
524692
77306
355372
305134
205390
201894
250239
751940
205390
85587
751940
74395
355372
205390
365567
133442
461495
153069
524692
524692
134018
53933
53933
77306
751940
250239
461495
355372
75992
354315
205390
461495
250239
118138
461495
354315
77306
118138
524692
354315

In [26]:
for triple in triples.collect():
    pass

                                                                                