# Keyword analysis


In [45]:
from collections import Counter

Read target corpus and reference corpus

In [3]:
# Load the target corpus (brown_c) and the reference corpus (brown_r) as
# lists of whitespace-separated tokens. The `with` blocks close each file
# handle as soon as it has been read instead of leaving it open until the
# garbage collector gets to it.
with open('brown-c.txt') as target_file:
    brown_c = target_file.read().split()
with open('brown-r.txt') as reference_file:
    brown_r = reference_file.read().split()


In [4]:
# Token counts of the target and reference corpora, in that order.
tuple(map(len, (brown_c, brown_r)))

(39399, 1121793)

In [5]:
# Peek at the first ten raw tokens of the reference corpus.
brown_r[:10]

['Dan',
 'Morgan',
 'told',
 'himself',
 'he',
 'would',
 'forget',
 'Ann',
 'Turner',
 '.']

Find most frequent words

In [7]:
# Frequency tables over the raw tokens of the target (f_c) and reference (f_r) corpora.
f_c, f_r = (Counter(tokens) for tokens in (brown_c, brown_r))

In [8]:
# Ten most frequent tokens in the target corpus.
f_c.most_common(10)

[('the', 2295),
 (',', 1913),
 ('of', 1494),
 ('.', 1382),
 ('and', 921),
 ('to', 882),
 ('in', 724),
 ('a', 655),
 ('is', 533),
 ('that', 475)]

In [9]:
# Ten most frequent tokens in the reference corpus.
f_r.most_common(10)

[('the', 60418),
 (',', 56421),
 ('.', 47964),
 ('of', 34586),
 ('and', 26994),
 ('to', 24850),
 ('a', 21226),
 ('in', 18812),
 ('that', 9762),
 ('was', 9563)]

Normalize text: keep only purely alphabetic tokens and convert them to lowercase

In [10]:
# Normalize both corpora: discard every token that contains a non-letter
# character (str.isalpha is False for punctuation, digits, hyphenated forms, ...)
# and lowercase the tokens that remain.
brown_c = [token.lower() for token in filter(str.isalpha, brown_c)]
brown_r = [token.lower() for token in filter(str.isalpha, brown_r)]

In [11]:
# First ten tokens of the reference corpus after normalization.
brown_r[:10]

['dan',
 'morgan',
 'told',
 'himself',
 'he',
 'would',
 'forget',
 'ann',
 'turner',
 'he']

In [12]:
# Rebuild both frequency tables from the normalized token lists.
f_c, f_r = map(Counter, (brown_c, brown_r))

In [13]:
# Ten most frequent words in the target corpus.
f_c.most_common(10)

[('the', 2480),
 ('of', 1505),
 ('and', 962),
 ('to', 904),
 ('in', 774),
 ('a', 694),
 ('is', 537),
 ('that', 492),
 ('it', 337),
 ('for', 315)]

In [14]:
# Ten most frequent words in the reference corpus.
f_r.most_common(10)

[('the', 67491),
 ('of', 34907),
 ('and', 27891),
 ('to', 25254),
 ('a', 22501),
 ('in', 20563),
 ('that', 10102),
 ('was', 9601),
 ('is', 9572),
 ('he', 9342)]

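Simple maths parameter (Brezina, p. 85): $\mathrm{SMP} = \dfrac{f_{\text{target}} + k}{f_{\text{reference}} + k}$, where $f$ is the relative frequency per million tokens and $k$ is a smoothing constant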

In [16]:
# Corpus sizes in millions of tokens: dividing a raw count by n_c or n_r
# yields a relative frequency per million tokens.
n_c = sum(f_c.values()) / 1_000_000
n_r = sum(f_r.values()) / 1_000_000

In [17]:
# Per-million relative frequency of the ten most frequent target-corpus words.
[(word, count / n_c) for word, count in f_c.most_common(10)]

[('the', 73031.39171918252),
 ('of', 44319.45344248778),
 ('and', 28329.112433005477),
 ('to', 26621.120207314918),
 ('in', 22792.861770422285),
 ('a', 20437.010424642205),
 ('is', 15813.652158548795),
 ('that', 14488.485776547499),
 ('it', 9924.023794098592),
 ('for', 9276.16467400907)]

In [31]:
# Score every word of the target corpus as
#   (relative freq. in target + k) / (relative freq. in reference + k)
# and store (score, word) pairs so that sorting orders them by score.
# f_r is a Counter, so words absent from the reference corpus count as 0.
k = 1
keywords = [
    ((count / n_c + k) / (f_r[word] / n_r + k), word)
    for word, count in f_c.items()
]

In [34]:
# Ten lowest scores: words that are relatively much rarer in the target than in the reference.
sorted(keywords)[: 10]

[(0.057055439884056654, 'af'),
 (0.05987675949967377, 'got'),
 (0.07402750786479025, 'her'),
 (0.08898406065774739, 'off'),
 (0.09952644640615906, 'company'),
 (0.11318973825678033, 'office'),
 (0.11685657821268, 'heard'),
 (0.11733170585426918, 'federal'),
 (0.11789064963474691, 'she'),
 (0.11829364710608645, 'street')]

In [35]:
# Ten highest scores, in ascending order (strongest keyword last).
sorted(keywords)[-10:]

[(236.5851345780081, 'guideposts'),
 (236.5851345780081, 'yin'),
 (258.41124716166667, 'realtors'),
 (266.0332764002591, 'han'),
 (266.0332764002591, 'saviour'),
 (301.3986901558779, 'lo'),
 (324.92956004476116, 'mahayana'),
 (354.37770186701215, 'yang'),
 (413.2739855115142, 'irenaeus'),
 (619.4109782672713, 'shu')]

In [36]:
# The same ten highest scores, strongest keyword first.
sorted(keywords, reverse = True)[:10]

[(619.4109782672713, 'shu'),
 (413.2739855115142, 'irenaeus'),
 (354.37770186701215, 'yang'),
 (324.92956004476116, 'mahayana'),
 (301.3986901558779, 'lo'),
 (266.0332764002591, 'saviour'),
 (266.0332764002591, 'han'),
 (258.41124716166667, 'realtors'),
 (236.5851345780081, 'yin'),
 (236.5851345780081, 'guideposts')]

In [37]:
# Recompute the (score, word) pairs with a larger smoothing constant.
# Adding k = 100 to both relative frequencies damps the ratio for
# low-frequency words.
k = 100
keywords = [
    ((f_c[word] / n_c + k) / (f_r[word] / n_r + k), word)
    for word in f_c.keys()
]

In [42]:
# Ten lowest scores with k = 100.
sorted(keywords)[: 10]

[(0.1018496095612704, 'her'),
 (0.13831610143644443, 'af'),
 (0.1460159975374528, 'she'),
 (0.20579421182600732, 'off'),
 (0.21307861844345138, 'got'),
 (0.3027603032972011, 'house'),
 (0.31968016742372235, 'company'),
 (0.35117922171802907, 'high'),
 (0.3517603916245708, 'office'),
 (0.3578989666096433, 'want')]

In [43]:
# Ten highest scores with k = 100, in ascending order (strongest keyword last).
sorted(keywords)[-10:]

[(7.534575794163571, 'zen'),
 (7.793799706671908, 'church'),
 (7.8943455668504585, 'christian'),
 (7.91391066683639, 'sin'),
 (7.955073271815822, 'faith'),
 (7.996521228865675, 'spirit'),
 (8.140972014659157, 'jesus'),
 (8.621129780092343, 'membership'),
 (13.115709397595085, 'christ'),
 (14.158803765162455, 'god')]

In [44]:
# The same ten highest scores with k = 100, strongest keyword first.
sorted(keywords, reverse=True)[: 10]

[(14.158803765162455, 'god'),
 (13.115709397595085, 'christ'),
 (8.621129780092343, 'membership'),
 (8.140972014659157, 'jesus'),
 (7.996521228865675, 'spirit'),
 (7.955073271815822, 'faith'),
 (7.91391066683639, 'sin'),
 (7.8943455668504585, 'christian'),
 (7.793799706671908, 'church'),
 (7.534575794163571, 'zen')]

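With k = 100 the top keywords are *god*, *christ*, *jesus*, *faith*, *church* and similar words, so I think the target corpus comes from the religion section of the Brown Corpus, and deals mostly with Christianity.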