Extract association rules from an arbitrary, relatively large document. Each sentence is treated as
one transaction and each word in it as an item. Existing library modules are used to mine the rules.

[article link](https://aeon.co/essays/being-underslept-and-out-of-sync-is-a-political-injustice)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
# Read the article into a 1-D array with one entry per non-empty line.
# A plain file read replaces np.loadtxt(..., delimiter='\n'): NumPy >= 1.23
# rejects a newline delimiter, and loadtxt would also silently drop anything
# that follows a '#' on a line.
# NOTE(review): assumes the file is UTF-8 encoded - confirm.
with open('./Q4/article.txt', encoding='utf-8') as article_file:
    article_lines = [line.rstrip('\r\n') for line in article_file]
# Empty lines are skipped, as loadtxt did
textArr = np.array([line for line in article_lines if line], dtype=str)

In [3]:
# Cut every line of the article at each full stop, then flatten the
# per-line pieces into a single 1-D array with one fragment per entry.
perLinePieces = np.char.split(textArr, sep='.')
flattenedSentencesArr = np.hstack(perLinePieces)
# One-column frame: each row holds one sentence fragment
data = pd.DataFrame({'sentences': flattenedSentencesArr})
data.head()

Unnamed: 0,sentences
0,"For Uber drivers trying to make ends meet, it ..."
1,It saves on a few journeys and helps make the...
2,It keeps a driver readily available for work ...
3,There are carparks where the sleeping bags co...
4,


In [4]:
# Strip line breaks from every sentence and mark sentences that are
# exactly the empty string as missing ...
data['sentences'] = (
    data['sentences']
    .map(lambda sentence: sentence.replace('\n', ''))
    .replace('', np.nan)
)
# ... so that they can be dropped as ordinary missing rows
data = data.dropna()
data.sample(5)

Unnamed: 0,sentences
92,The service sector offers many examples
59,"Harmful, undeserved and avoidable forms of in..."
2,It keeps a driver readily available for work ...
47,Then there are questions of privacy – exposur...
122,Fortunately the damage was only material


In [5]:
import re


def _tokenise(sentence):
    """Lower-case a sentence and cut it at every non-word character.

    Each non-word character is first turned into a comma and the text is then
    split on commas, so adjacent separators leave empty strings in the result.
    """
    lowered = sentence.lower()
    comma_separated = re.sub(r'[^\w]', ',', lowered)
    return comma_separated.split(',')


data['sentences'] = data['sentences'].map(_tokenise)

In [6]:
# Discard the empty strings inside each token list
# NOTE(review): a sentence whose token list ends up empty is still kept as a row
data['sentences'] = data['sentences'].map(
    lambda tokens: [token for token in tokens if token]
)
data.sample(5)

Unnamed: 0,sentences
148,"[and, individual, decisions, have, effects, on..."
191,"[the, fact, that, sleep, disadvantages, tend, ..."
103,"[in, more, affluent, sectors, the, rise, of, w..."
233,"[meanwhile, proposed, right, to, disconnect, l..."
264,"[allowing, people, to, live, less, cramped, li..."


In [131]:
# (rows, columns) of the sentence frame at this point in the notebook
data.shape

(240, 1)

In [132]:
%pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.19.0-py2.py3-none-any.whl (1.3 MB)
Installing collected packages: mlxtend
Successfully installed mlxtend-0.19.0
Note: you may need to restart the kernel to use updated packages.


In [133]:
from mlxtend.preprocessing import TransactionEncoder

In [134]:
# Learn the set of distinct tokens, then encode every sentence as one row
# with one boolean column per token.
te = TransactionEncoder()
te.fit(data['sentences'])
te_ary = te.transform(data['sentences'])
df = pd.DataFrame(data=te_ary, columns=te.columns_)
df

Unnamed: 0,10,10pm,11am,1867,19,1900s,1941,1949,19th,2016,...,world,would,wrong,yawn,year,years,yet,yields,you,yourself
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
236,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
237,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
238,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [136]:
from mlxtend.frequent_patterns import apriori

In [216]:
# Mine the itemsets of tokens whose support is at least 0.03;
# use_colnames=True labels each itemset with the tokens themselves
# rather than with column indices.
df_apr = apriori(df, min_support=0.03, use_colnames=True)
df_apr

Unnamed: 0,support,itemsets
0,0.308333,(a)
1,0.033333,(also)
2,0.083333,(an)
3,0.400000,(and)
4,0.216667,(are)
...,...,...
632,0.033333,"(their, to, of, the)"
633,0.054167,"(the, to, of, with)"
634,0.033333,"(that, to, on, the)"
635,0.033333,"(that, to, sleep, the)"


In [218]:
# Most frequent itemsets first
df_apr.sort_values(by='support', ascending=False)

Unnamed: 0,support,itemsets,length
63,0.579167,(the),1
38,0.529167,(of),1
72,0.445833,(to),1
3,0.400000,(and),1
55,0.375000,(sleep),1
...,...,...,...
405,0.033333,"(more, of, and)",3
406,0.033333,"(and, of, on)",3
33,0.033333,(minorities),1
413,0.033333,"(on, and, the)",3


In [219]:
# Record how many items each itemset contains
df_apr['length'] = df_apr['itemsets'].map(len)
df_apr.sample(3)

Unnamed: 0,support,itemsets,length
614,0.045833,"(the, in, of, with)",4
514,0.054167,"(more, of, the)",3
226,0.1,"(in, sleep)",2


In [221]:
# Itemsets with three or more items whose support is above 0.05
df_apr.loc[df_apr['length'].ge(3) & df_apr['support'].gt(0.05)]

Unnamed: 0,support,itemsets,length
343,0.079167,"(a, of, and)",3
344,0.079167,"(a, and, the)",3
346,0.054167,"(be, a, of)",3
348,0.054167,"(be, a, to)",3
352,0.075000,"(a, in, of)",3
...,...,...,...
610,0.058333,"(in, sleep, of, the)",4
613,0.070833,"(to, in, of, the)",4
619,0.058333,"(is, sleep, of, the)",4
629,0.062500,"(to, sleep, of, the)",4


In [158]:
from mlxtend.frequent_patterns import association_rules

In [211]:
# Derive rules from the frequent itemsets, keeping only those whose
# confidence reaches the threshold of 1.
df_ar = association_rules(df_apr, metric='confidence', min_threshold=1)

In [212]:
# Peek at five randomly chosen rules (no seed is set, so the rows vary per run)
df_ar.sample(5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
7,"(more, sleep)",(of),0.041667,0.529167,0.041667,1.0,1.889764,0.019618,inf
3,(poor),(sleep),0.05,0.375,0.05,1.0,2.666667,0.03125,inf
4,"(do, and)",(the),0.033333,0.579167,0.033333,1.0,1.726619,0.014028,inf
1,(desynchronisation),(of),0.033333,0.529167,0.033333,1.0,1.889764,0.015694,inf
16,"(by, of, and)",(the),0.033333,0.579167,0.033333,1.0,1.726619,0.014028,inf


In [213]:
# Order the rules by confidence, highest first.
# NOTE(review): the rules were generated with min_threshold=1 on confidence,
# so every row already has confidence 1 and this sort cannot separate them.
df_ar.sort_values(by='confidence', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(hour),(and),0.033333,0.4,0.033333,1.0,2.5,0.02,inf
1,(desynchronisation),(of),0.033333,0.529167,0.033333,1.0,1.889764,0.015694,inf
16,"(by, of, and)",(the),0.033333,0.579167,0.033333,1.0,1.726619,0.014028,inf
15,"(as, to, and)",(the),0.033333,0.579167,0.033333,1.0,1.726619,0.014028,inf
14,"(this, sleep)",(the),0.0375,0.579167,0.0375,1.0,1.726619,0.015781,inf
13,"(poor, the)",(sleep),0.041667,0.375,0.041667,1.0,2.666667,0.026042,inf
12,"(with, on)",(the),0.033333,0.579167,0.033333,1.0,1.726619,0.014028,inf
11,"(sleep, on)",(the),0.033333,0.579167,0.033333,1.0,1.726619,0.014028,inf
10,"(this, of)",(the),0.033333,0.579167,0.033333,1.0,1.726619,0.014028,inf
9,"(political, to)",(of),0.033333,0.529167,0.033333,1.0,1.889764,0.015694,inf
