# -*- coding: utf-8 -*-
"""
Created on Wed May 4 15:30:50 2016
@author: jamiehand, with help from Pete Johnson's CS150 examples
"""
import string

from stop_words import get_stop_words
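# NOTE: this import appears to be the PyPI "stop-words" package
# (pip install stop-words); get_stop_words('en') returns a list of
# common English words ("the", "and", ...).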


def read_words(filename):
    """
    Read lines from a file, strip newlines and punctuation from the lines,
    split the lines into words, make each word lowercase, and return a
    list of those words.

    Ideas on translate (for removing punctuation) from:
    http://stackoverflow.com/a/266162/4979097
    and http://stackoverflow.com/a/12453622/4979097
    """
    # NOTE: this removes apostrophes, too
    translate_table = str.maketrans("", "", string.punctuation)
    words = []
    # strip surrounding whitespace, remove punctuation from the line, then
    # split into individual words -- then make each word lowercase and add
    # it to the words list
    with open(filename, "r") as f:
        for line in f:
            words += (word.lower() for word in
                      line.strip().translate(translate_table).split())
    return words
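
# Quick sanity check, for reference (hypothetical input): a file containing
# the line "Hello, world! Hello." yields ['hello', 'world', 'hello'].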


def count_words(word_list):
    """
    Traverse word_list to find out how many times each word appears.
    Return a dictionary with word as the key and that word's frequency as
    the value.
    """
    word_with_frequency_dict = dict()
    for word in word_list:
        if word in word_with_frequency_dict:
            word_with_frequency_dict[word] += 1
        else:
            word_with_frequency_dict[word] = 1
    return word_with_frequency_dict
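
# Equivalent one-liner via the standard library, for reference:
#   from collections import Counter
#   word_with_frequency_dict = dict(Counter(word_list))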


def remove_stop_words(word_dict):
    """
    Remove English stop words from word_dict (modifies the dict in place).
    """
    stop_words = get_stop_words('en')
    # make the keys into a new list, so we can change the dict's size (i.e.
    # delete keys) while iterating over the dict's keys
    for word in list(word_dict.keys()):
        if word in stop_words:
            del word_dict[word]
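
# A non-mutating alternative, for reference:
#   {word: n for word, n in word_dict.items() if word not in stop_words}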


def get_most_frequent(word_dict, num_words):
    """
    Return a list of (word, frequency) tuples for the num_words most
    frequent words in word_dict. Repeatedly finds the largest key/value
    pair (get max value) and removes it, num_words times -- an alternative
    would be to sort the keys (words) by value and extract the top
    num_words.

    Ideas here: http://stackoverflow.com/a/280156/4979097
    """
    most_freq = []
    while len(most_freq) < num_words and len(word_dict) > 0:
        # dict.get() returns the value (here, the count) for a given key
        max_word = max(word_dict, key=word_dict.get)
        most_freq.append((max_word, word_dict[max_word]))
        # NOTE only removing from word_dict b/c we know we won't need it
        # again! TODO way to do this w/o deleting from word_dict?
        del word_dict[max_word]
    return most_freq
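
# One answer to the TODO above -- build the list without mutating word_dict
# (untested sketches):
#   sorted(word_dict.items(), key=lambda kv: kv[1], reverse=True)[:num_words]
# or, via the standard library:
#   import heapq
#   heapq.nlargest(num_words, word_dict.items(), key=lambda kv: kv[1])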


def write_words_to_file(word_tuples, year):
    """
    Take in a list of (word, frequency) tuples and write them to a csv
    file, one per line.
    """
    filename = "SOTU-csv/" + str(year) + ".csv"
    # `with` ensures the file is flushed and closed, even on error
    with open(filename, "w") as f:
        f.write("word,freq,year\n")
        for tup in word_tuples:
            f.write(tup[0] + "," + str(tup[1]) + "," + str(year) + "\n")
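
# A csv.writer-based sketch, for reference -- it would also escape any
# stray commas in the tokens:
#   import csv
#   with open(filename, "w", newline="") as f:
#       writer = csv.writer(f)
#       writer.writerow(["word", "freq", "year"])
#       writer.writerows((word, count, year) for word, count in word_tuples)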


def txt_to_csv(input_filename, year):
    """
    Read a text file, count its words, drop stop words, and write the
    frequencies out to SOTU-csv/<year>.csv.
    """
    word_list = read_words(input_filename)
    word_dict = count_words(word_list)
    remove_stop_words(word_dict)
    # 100000 is effectively "all words": get_most_frequent stops early
    # once word_dict is empty
    word_tuples = get_most_frequent(word_dict, 100000)
    write_words_to_file(word_tuples, year)
    print(word_tuples)


if __name__ == "__main__":
    txt_to_csv("2002.txt", 2002)
    # The same pipeline, step by step, keeping only the top 25 words:
    # word_list = read_words("2002.txt")
    # word_dict = count_words(word_list)
    # remove_stop_words(word_dict)
    # word_tuples = get_most_frequent(word_dict, 25)
    # write_words_to_file(word_tuples, 2002)
    # TODO write a file aggregator that outputs something like:
    # word:___ freq:{[year:___, count:___], [year:___, count:___]}
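
# A hedged sketch of that aggregator (names hypothetical, untested): read
# each year's csv back in and group the counts by word:
#   import csv, glob
#   by_word = {}
#   for path in glob.glob("SOTU-csv/*.csv"):
#       with open(path) as f:
#           for row in csv.DictReader(f):
#               by_word.setdefault(row["word"], []).append(
#                   (row["year"], int(row["freq"])))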