/
wordfreq.py
69 lines (53 loc) · 2.01 KB
/
wordfreq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
"""Count the frequencies of words in a string"""
from __future__ import division
from __future__ import print_function
import cmath as math
def wordfreq(text, is_filename=False):
    """Return a dictionary of words and word counts in a string.

    Words are the whitespace-separated tokens of ``text`` (``str.split()``
    with no arguments), lower-cased before counting. Punctuation is not
    stripped, so ``"cat"`` and ``"cat."`` are counted separately.

    Parameters
    ----------
    text : str
        The text to analyse, or the path of a file to read when
        ``is_filename`` is true.
    is_filename : bool, optional
        If true, ``text`` is opened as a file and its whole content is
        counted instead.

    Returns
    -------
    dict
        Maps each lower-cased word to the number of times it occurs.
    """
    from collections import Counter

    if is_filename:
        with open(text) as f:
            text = f.read()
    # Convert back to a plain dict so callers still get a KeyError (rather
    # than Counter's implicit 0) when looking up a word that never occurred.
    return dict(Counter(word.lower() for word in text.split()))
def print_wordfreq(freqs, n=10):
    """Print the n most common words and counts in the freqs dict.

    One ``word count`` pair is printed per line, most frequent first.
    Words with equal counts are printed in reverse alphabetical order,
    because the (count, word) pairs are sorted in descending order.

    Parameters
    ----------
    freqs : dict
        Maps words to their counts.
    n : int, optional
        Maximum number of entries to print.
    """
    # sorted() instead of list.sort(): on Python 3 zip()/dict views are not
    # lists and have no .sort() method; sorted() works on Python 2 and 3.
    ranked = sorted(
        ((count, word) for word, count in freqs.items()),
        reverse=True,
    )
    for count, word in ranked[:n]:
        print(word, count)
def wordfreq_to_weightsize(worddict, minsize=25, maxsize=50, minalpha=0.5, maxalpha=1.0):
    """Convert word counts into ``(alpha, size)`` pairs by linear scaling.

    Each count is normalised to a weight in [0, 1] relative to the smallest
    and largest counts in ``worddict``; that weight then interpolates
    between ``minalpha``/``maxalpha`` and between ``minsize``/``maxsize``.

    Parameters
    ----------
    worddict : dict
        Maps words to their counts.
    minsize, maxsize : number, optional
        Size assigned to the least / most frequent word.
    minalpha, maxalpha : number, optional
        Alpha assigned to the least / most frequent word.

    Returns
    -------
    dict
        Maps each word to an ``(alpha, size)`` tuple. Empty when
        ``worddict`` is empty.
    """
    if not worddict:
        # min()/max() raise ValueError on an empty sequence.
        return {}

    # .values()/.items() rather than itervalues()/iteritems(): the latter
    # exist only on Python 2, the former work on both.
    mincount = min(worddict.values())
    maxcount = max(worddict.values())
    span = maxcount - mincount

    weights = {}
    for word, count in worddict.items():
        # When every count is identical the range is zero; treat all words
        # as maximally weighted instead of dividing by zero.
        w = (count - mincount) / span if span else 1.0
        alpha = minalpha + (maxalpha - minalpha) * w
        size = minsize + (maxsize - minsize) * w
        weights[word] = (alpha, size)
    return weights
def tagcloud(worddict, n=10, minsize=25, maxsize=50, minalpha=0.5, maxalpha=1.0):
    """Draw the n highest-weighted words at random positions on a new figure.

    Counts are converted to ``(alpha, size)`` pairs with
    ``wordfreq_to_weightsize``; the ``n`` entries with the largest
    (alpha, size, word) tuples are then drawn as lower-cased text, using
    the alpha as transparency and the size as font size.

    Parameters
    ----------
    worddict : dict
        Maps words to their counts.
    n : int, optional
        Maximum number of words to draw.
    minsize, maxsize, minalpha, maxalpha : number, optional
        Passed through to ``wordfreq_to_weightsize``.

    Returns
    -------
    The matplotlib axes the words were drawn on.
    """
    from matplotlib import pyplot as plt
    import random

    weights = wordfreq_to_weightsize(worddict, minsize, maxsize, minalpha, maxalpha)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Let the axes fill the whole figure and hide the tick marks.
    ax.set_position([0.0, 0.0, 1.0, 1.0])
    ax.set_xticks([])
    ax.set_yticks([])

    # sorted() instead of list.sort(): on Python 3 zip()/dict views are not
    # lists and have no .sort() method; sorted() works on Python 2 and 3.
    items = sorted(
        ((alpha, size, word) for word, (alpha, size) in weights.items()),
        reverse=True,
    )
    for alpha, size, word in items[:n]:
        # Positions are drawn uniformly from the unit square. A placement
        # clustered around the centre would instead use
        # random.normalvariate(0.5, 0.3) for each coordinate.
        xpos = random.uniform(0.0, 1.0)
        ypos = random.uniform(0.0, 1.0)
        ax.text(xpos, ypos, word.lower(), alpha=alpha, fontsize=size)
    ax.autoscale_view()
    return ax