Extract association rules from an arbitrary, relatively large text document, treating each sentence
as one transaction whose items are its words. Existing library modules (here, `mlxtend`) are used to mine the rules.

[article link](https://aeon.co/essays/being-underslept-and-out-of-sync-is-a-political-injustice)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## Read txt files and perform preprocessing

In [2]:
# Read the article as plain text, one array element per non-empty line.
# np.loadtxt is a parser for delimited tables: by default it treats '#' as the
# start of a comment and drops the rest of the line, and a single-line file is
# squeezed to a 0-d array.
# NOTE(review): newer NumPy releases (1.23+) are believed to reject a newline
# as `delimiter` -- confirm against the installed version.
# NOTE(review): assumes the file is UTF-8 encoded -- adjust `encoding` if not.
with open('./Q4/article.txt', encoding='utf-8') as article_file:
    article_lines = [line.rstrip('\r\n') for line in article_file]
# Blank lines carry no sentences, so they are skipped; the result is always 1-D.
textArr = np.array([line for line in article_lines if line], dtype=str)

In [3]:
# Break every line into fragments at each '.', then flatten the per-line
# lists into one 1-D array of sentence fragments.
perLineFragments = np.char.split(textArr, sep='.')
flattenedSentencesArr = np.hstack(perLineFragments)
# One row per fragment, held in a single 'sentences' column.
data = pd.DataFrame(flattenedSentencesArr, columns=['sentences'])
data.head()

Unnamed: 0,sentences
0,"For Uber drivers trying to make ends meet, it ..."
1,It saves on a few journeys and helps make the...
2,It keeps a driver readily available for work ...
3,There are carparks where the sleeping bags co...
4,


In [4]:
# remove line breaks and surrounding whitespace
# Splitting on '.' can leave fragments that are empty or whitespace-only; an
# exact comparison with '' would keep the whitespace-only ones, and they would
# later turn into empty transactions. Stripping first makes both cases ''.
data['sentences'] = (
    data['sentences']
    .str.replace('\n', '', regex=False)
    .str.strip()
)
# drop empty sentences
data['sentences'] = data['sentences'].replace('', np.nan)
data = data.dropna()
data.sample(5)

Unnamed: 0,sentences
143,"From a libertarian perspective, it might be sa..."
64,Problems of sleep matter because they are not ...
198,It inhibits the cognitive functions required ...
213,"When people sleep in synchrony, there is less..."
274,"They may also be transient minorities, insofa..."


In [5]:
# remove symbols, transform to lowercase, split into array of word
import re

# Each non-word character is turned into a comma and the commas then act as
# split points, so runs of symbols produce empty strings in the result.
non_word_char = re.compile(r'[^\w]')
data['sentences'] = data['sentences'].apply(
    lambda sentence: non_word_char.sub(',', sentence.lower()).split(',')
)

In [6]:
# remove common words
common_words = ['the', 'a', 'an', 'and','of','in','is','are','was','were','that','this']
# A frozenset gives constant-time membership tests; `common_words` itself is
# kept as a list so anything else reading it sees the same object type.
stop_words = frozenset(common_words)
# Single pass per sentence: drop the empty strings left by the symbol split
# and the common words. (The loop variable is no longer named `str`, which
# shadowed the builtin inside the comprehension.)
data['sentences'] = data['sentences'].apply(
    lambda tokens: [token for token in tokens if token and token not in stop_words]
)
# Drop sentences with no tokens left: an empty transaction contains no items
# but would still count in the denominator of every support value.
data = data[data['sentences'].apply(len) > 0].copy()
data.sample(5)

Unnamed: 0,sentences
64,"[because, other, they, just, matter, sleep, pr..."
40,"[journeys, hour, the, business, it, peak, save..."
120,"[told, the, in, 2016, guardian, as, doctor, on..."
6,"[how, get, temperature, adequate, deal, are, f..."
92,"[sector, the, many, service, examples, offers]"


In [10]:
# (rows, columns) of the preprocessed sentence table
data.shape

(240, 1)

## Perform association rule mining using apriori

In [132]:
#  install mlxtend
%pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.19.0-py2.py3-none-any.whl (1.3 MB)
Installing collected packages: mlxtend
Successfully installed mlxtend-0.19.0
Note: you may need to restart the kernel to use updated packages.


In [7]:
from mlxtend.preprocessing import TransactionEncoder

In [11]:
# One-hot encode the token lists: one row per sentence and one column per
# word learned during fit.
te = TransactionEncoder()
te.fit(data['sentences'])
te_ary = te.transform(data['sentences'])
df = pd.DataFrame(data=te_ary, columns=te.columns_)
df

Unnamed: 0,10,10pm,11am,1867,19,1900s,1941,1949,19th,2016,...,world,would,wrong,yawn,year,years,yet,yields,you,yourself
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
236,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
237,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
238,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [12]:
from mlxtend.frequent_patterns import apriori

In [216]:
# Mine frequent itemsets with a minimum support of 0.03, reporting the
# itemsets by column (word) name rather than by column index.
df_apr = apriori(
    df,
    use_colnames=True,
    min_support=0.03,
)
df_apr

Unnamed: 0,support,itemsets
0,0.308333,(a)
1,0.033333,(also)
2,0.083333,(an)
3,0.400000,(and)
4,0.216667,(are)
...,...,...
632,0.033333,"(their, to, of, the)"
633,0.054167,"(the, to, of, with)"
634,0.033333,"(that, to, on, the)"
635,0.033333,"(that, to, sleep, the)"


In [218]:
# Most frequent itemsets first.
df_apr.sort_values(by='support', ascending=False)

Unnamed: 0,support,itemsets,length
63,0.579167,(the),1
38,0.529167,(of),1
72,0.445833,(to),1
3,0.400000,(and),1
55,0.375000,(sleep),1
...,...,...,...
405,0.033333,"(more, of, and)",3
406,0.033333,"(and, of, on)",3
33,0.033333,(minorities),1
413,0.033333,"(on, and, the)",3


In [15]:
# Record how many items each frequent itemset contains.
df_apr['length'] = df_apr['itemsets'].map(len)
df_apr.sample(3)

Unnamed: 0,support,itemsets,length
76,0.045833,"(be, they)",2
64,0.033333,(which),1
61,0.045833,(what),1


In [16]:
# Keep itemsets with at least three items whose support exceeds 0.05.
has_three_or_more_items = df_apr['length'] >= 3
exceeds_support_cutoff = df_apr['support'] > 0.05
df_apr[has_three_or_more_items & exceeds_support_cutoff]

Unnamed: 0,support,itemsets,length


In [18]:
from mlxtend.frequent_patterns import association_rules

In [21]:
# Derive association rules from the frequent itemsets, using lift as the
# selection metric with a threshold of 1.
rules = association_rules(
    df_apr,
    metric='lift',
    min_threshold=1,
)

In [23]:
# Peek at five randomly chosen rules.
rules.sample(n=5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
19,(sleep),(can),0.375,0.1,0.054167,0.144444,1.444444,0.016667,1.051948
2,(it),(be),0.195833,0.183333,0.0375,0.191489,1.044487,0.001597,1.010088
22,(not),(it),0.079167,0.195833,0.033333,0.421053,2.150056,0.01783,1.389015
4,(be),(sleep),0.183333,0.375,0.091667,0.5,1.333333,0.022917,1.25
31,(their),(people),0.154167,0.079167,0.041667,0.27027,3.41394,0.029462,1.261883


In [24]:
# Rules ordered from highest to lowest confidence.
rules.sort_values(by='confidence', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
33,(poor),(sleep),0.05,0.375,0.05,1.0,2.666667,0.03125,inf
34,(short),(sleep),0.041667,0.375,0.0375,0.9,2.4,0.021875,6.25
24,(less),(sleep),0.05,0.375,0.033333,0.666667,1.777778,0.014583,1.875
40,"(can, be)",(sleep),0.05,0.375,0.033333,0.666667,1.777778,0.014583,1.875
26,(one),(sleep),0.116667,0.375,0.075,0.642857,1.714286,0.03125,1.75
28,(other),(sleep),0.066667,0.375,0.041667,0.625,1.666667,0.016667,1.666667
41,"(can, sleep)",(be),0.054167,0.183333,0.033333,0.615385,3.356643,0.023403,2.123333
18,(can),(sleep),0.1,0.375,0.054167,0.541667,1.444444,0.016667,1.363636
30,(people),(their),0.079167,0.154167,0.041667,0.526316,3.41394,0.029462,1.785648
11,(but),(sleep),0.066667,0.375,0.033333,0.5,1.333333,0.008333,1.25
