from __future__ import division
from __future__ import print_function
import os
import math
import errno
import json
import operator
import numpy as np
import praw
import urllib2
import nltk
from nltk.stem.porter import PorterStemmer
from collections import Counter, OrderedDict
from time import time, sleep
from string import punctuation
from praw.handlers import MultiprocessHandler
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
class WordCounter(object):
"""Performs word counting given an input string.
Data attributes:
stemmer: Porter stemmer used optionally to perform stemming of extracted words
stopwords (list): list of stop words used to reject common words such as 'and'
Methods:
tokenize
get_word_count
remove_punctuation
remove_stopwords
stem_tokens: perform Porter stemming on a list of words
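    Example (an illustrative sketch; requires NLTK's 'punkt' tokenizer data, and the exact
    output depends on the bundled stop-word list):
    >>> counter = WordCounter()
    >>> counter.get_word_count('The cat chased the other cat')
    Counter({'cat': 2, 'chased': 1})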
"""
def __init__(self):
self.stemmer = PorterStemmer()
# Load stop-words
application_root = os.path.dirname(__file__)
stopwords = os.path.join(application_root, 'words/stopwords_english.txt')
with open(stopwords, 'rb') as stopwords_file:
            self.stopwords = [word.strip() for word in stopwords_file]
def tokenize(self, text):
"""Tokenize an input string into a list of words (with punctuation removed)."""
text = text.lower()
punctuation_removed = self.remove_punctuation(text)
tokens = nltk.word_tokenize(punctuation_removed)
return tokens
def get_word_count(self, text, stop_words=True, stemming=False):
"""Return a dict (Counter) of words and corresponding counts given an input string."""
tokens = self.tokenize(text)
# Remove stop words
if stop_words:
tokens = self.remove_stopwords(tokens)
if stemming:
tokens = self.stem_tokens(tokens)
return Counter(tokens)
@staticmethod
def remove_punctuation(text, replacement=' ', exclude="'"):
"""Remove punctuation from an input string."""
        text = text.replace("'", "")  # Apostrophes are removed outright (not replaced) so contractions stay joined
for p in set(list(punctuation)) - set(list(exclude)):
text = text.replace(p, replacement)
text = ' '.join(text.split()) # Remove excess whitespace
return text
def remove_stopwords(self, tokens):
"""Remove all stopwords from a list of word tokens."""
return [word for word in tokens if word not in self.stopwords]
def stem_tokens(self, tokens):
"""Perform porter stemming on a list of word tokens."""
return [self.stemmer.stem(word) for word in tokens]
def count_words_from_list(self, text, word_list, normalize=True):
"""Count the number of times the words from a given list appear in text."""
text = self.tokenize(text)
count = sum([1 for word in text if word in word_list])
if normalize:
count /= len(text)
return count
class RedditWordCounter(WordCounter):
"""Performs word counting of comments and titles in Reddit using the Reddit API.
To initialise a new RedditWordCounter instance:
>>> counter = RedditWordCounter('your_username')
To adhere to the Reddit API rules, please provide your Reddit username in place of 'your_username' above.
This will ensure that the app doesn't get banned from Reddit!
Data Attributes:
user_agent (str): required to connect to Reddit
reddit: instance of the Reddit API connection
word_counter: WordCounter object used to perform word counting given input strings
Methods:
subreddit_comments: word count from comments of a given subreddit
subreddit_titles: word count from titles of a given subreddit
user_comments: word count from comments of a given user
check_connection: check that there is a working connection to Reddit
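    Example (illustrative only; assumes network access, a valid Reddit username and that
    the subreddit named below exists):
    >>> vocabulary = counter.subreddit_comments('python', limit=500)
    >>> vocabulary.most_common(10)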
"""
def __init__(
self,
user,
multiprocess=False
):
"""Initialise a RedditWordCounter object.
:param user: your Reddit username
        :param multiprocess: if True, uses PRAW's MultiprocessHandler so that several RedditWordCounter instances can share rate limiting (False by default)
:return:
"""
super(RedditWordCounter, self).__init__() # Initialise the WordCounter class
handler = MultiprocessHandler() if multiprocess else None
self.user_agent = 'redditvocab/0.1 bot by {0}'.format(user)
self.reddit = praw.Reddit(user_agent=self.user_agent, handler=handler)
def subreddit_comments(self, subreddit_name, limit=1000, stemming=False, get_all_comments=False):
"""Retrieve the vocabulary from the comments of a subreddit.
:param subreddit_name: name of the subreddit excluding '/r/'
:param limit: number of comments to retrieve (1000 by default) - note that at present the limit is approximate
:param stemming: if True, performs stemming on tokenized words (False by default)
:param get_all_comments: if True, retrieves all comments per submission. Note that this requires descending the
comment tree, which drastically increases the number of API calls and reduces performance due to rate-limiting.
:return: Counter (dict) of comment vocabulary in the form {'term1': freq, 'term2': freq, ...}
"""
def get_vocabulary(comments):
vocab = Counter()
num_comments = 0
for comment in comments:
if isinstance(comment, praw.objects.Comment):
try:
# Get the word counts for the comment
vocab += self.get_word_count(comment.body, stemming=stemming)
num_comments += 1
except ValueError:
pass
elif isinstance(comment, praw.objects.MoreComments) and get_all_comments:
new_vocab, num_new_comments = get_vocabulary(comment.comments)
vocab += new_vocab
num_comments += num_new_comments
return vocab, num_comments
subreddit = self.reddit.get_subreddit(subreddit_name)
# Initialise loop variables
vocabulary = Counter()
comments_processed = 0
for submission in subreddit.get_hot(limit=None):
submission_comments = praw.helpers.flatten_tree(submission.comments)
# Run over all comments
submission_vocabulary, new_comments = get_vocabulary(submission_comments)
vocabulary += submission_vocabulary
comments_processed += new_comments
print("Comments processed for subreddit '{0}': {1}".format(subreddit_name, comments_processed), end="\r")
if limit and comments_processed >= limit:
break
print('\n')
return vocabulary
def subreddit_titles(self, subreddit_name, limit=1000, stemming=False):
"""Retrieve the vocabulary from the titles in a subreddit.
:param subreddit_name: name of the subreddit excluding '/r/'
:param limit: number of submissions to process (1000 by default - note that this is the maximum)
:param stemming: if True, performs stemming on tokenized words (False by default)
:return: Counter (dict) of title vocabulary in the form {'term1': freq, 'term2': freq, ...}
"""
subreddit = self.reddit.get_subreddit(subreddit_name)
# Initialise loop variables
vocabulary = Counter()
submissions_processed = 0
for submission in subreddit.get_hot(limit=limit):
try:
# Update the word counter to include the comment
vocabulary += self.get_word_count(submission.title, stemming=stemming)
submissions_processed += 1
if submissions_processed % 100 == 0 or submissions_processed >= limit:
print("Titles processed for subreddit '{0}': {1}".format(subreddit_name, submissions_processed),
end="\r")
except ValueError:
pass
print('\n')
return vocabulary
def user_comments(self, username, limit=1000, stemming=False):
"""Retrieve the vocabulary of a user's comments.
:param username: user's Reddit username excluding '/u/'
        :param limit: number of comments to process (1000 by default - note that this is the maximum)
:param stemming: if True, performs stemming on tokenized words (False by default)
:return: Counter (dict) of user's vocabulary in the form {'term1': freq, 'term2': freq, ...}
"""
user = self.reddit.get_redditor(username)
vocabulary = Counter()
comments_processed = 0
for comment in user.get_comments(limit=limit):
try:
# Get the word counts for the comment
vocabulary += self.get_word_count(comment.body, stemming=stemming)
comments_processed += 1
if comments_processed % 100 == 0 or comments_processed >= limit:
print("Comments processed for user '{0}': {1}".format(username, comments_processed), end="\r")
except ValueError:
pass
print('\n')
return vocabulary
def check_connection(self, timeout=10):
"""Wait for a server response."""
header = {'User-Agent': self.user_agent}
start = time()
while True:
try:
request = urllib2.Request("http://www.reddit.com/", headers=header)
response = urllib2.urlopen(request)
response.read()
sleep(2) # Adhere to Reddit API rule of 30 requests per minute
if response.getcode() == 200:
return True
except urllib2.HTTPError as err:
print(err)
finally:
if time() - start > timeout:
return False
class TfidfCorpus(object):
"""Stores features (e.g. words) and their document frequencies in an inverted index. Useful for NLP and machine
learning applications.
To initialise a new TfidfCorpus instance:
>>> corpus = TfidfCorpus()
    By default the corpus will save to 'corpus.json' in the current working directory. You can specify an existing file to load
or a specific save path as follows:
>>> corpus = TfidfCorpus(corpus_path='path/to/corpus.json')
Data Attributes:
corpus_path (str): save/load path of the corpus
document_list (list): list of strings indicating the documents stored in the corpus
document_lengths (dict): sum of word frequencies contained in each document, takes the form:
{
"document1": int,
"document2": int,
...
}
corpus (dict): dict of Counters that takes the form:
{
"term1": {
"document1": int,
"document2": int
},
"term2": {
"document1": int,
"document2": int,
},
...
}
Methods:
save
load
get_corpus_path
get_document_list
add_document
get_document
delete_document
append_document
get_idf
get_tfidf
get_document_tfidfs
get_top_terms
build_feature_matrix
train_classifier
classify_document
count_words_from_list
get_mean_word_length
check_corpus_path
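    A typical workflow (an illustrative sketch; the document names and terms are hypothetical):
    >>> corpus.add_document(Counter({'cheese': 3, 'wine': 1}), 'france')
    >>> corpus.add_document(Counter({'tea': 2, 'rain': 2}), 'england')
    >>> corpus.get_top_terms('france', num_terms=2)
    >>> corpus.save()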
"""
def __init__(self, corpus_path='corpus.json'):
# Check that the corpus path is valid
self.check_corpus_path(corpus_path)
self.corpus_path = corpus_path
self.document_list = list()
self.document_lengths = dict()
self.corpus = dict()
# Initialise scikit-learn attributes
self.vectorizer = None
self.tfidf_transformer = None
self.feature_matrix = None
self.classifier = None
if os.path.isfile(corpus_path):
self.load()
def save(self, path=''):
"""Save the corpus to a JSON file at the path specified in self.corpus_path.
:param path: you can specify a save path (must end in .json), which will change self.corpus_path
"""
if path:
self.check_corpus_path(path)
self.corpus_path = path
with open(self.corpus_path, 'wb') as save_file:
json.dump(
{
'document_list': self.document_list,
'document_lengths': self.document_lengths,
'corpus': self.corpus
},
save_file
)
def load(self):
"""Load the corpus from a JSON file. File path defined in self.corpus_path."""
with open(self.corpus_path, 'rb') as load_file:
data = json.load(load_file)
try:
self.document_list = data['document_list']
self.document_lengths = data['document_lengths']
self.corpus = data['corpus']
# Make sure that frequency dicts in corpus are Counter objects
for term in self.corpus.iterkeys():
self.corpus[term] = Counter(self.corpus[term])
except KeyError as err:
print('Provided file does not have expected structure')
raise err
def get_corpus_path(self):
return self.corpus_path
def set_corpus_path(self, path):
if not path.lower().endswith('.json'):
raise Exception('Corpus path must be a JSON file (.json extension).')
self.corpus_path = path
def get_document_list(self):
return self.document_list
def get_vocabulary(self):
"""Return the full list of terms in the corpus."""
return self.corpus.keys()
def get_document(self, document_name):
"""Retrieve a document from the corpus."""
if document_name not in self.document_list:
raise Exception("No document with name '{0}' found in corpus".format(document_name))
return Counter({
term: freqs[document_name] for term, freqs in self.corpus.iteritems() if freqs.get(document_name, 0)
})
def add_document(self, document, document_name):
"""Load a document into the corpus.
:param document: takes the form {'term1': freq1, 'term2', freq2, ...}
:param document_name: string which uniquely identifies the document
"""
if document_name in self.document_list:
print("Document with name '{0}' already exists in corpus."
"Do you wish to replace it?".format(document_name))
while True:
replace_doc = raw_input("Response (y/n): ")
if replace_doc in ['y', 'yes', 'ye']:
self.delete_document(document_name)
break
elif replace_doc in ['n', 'no']:
return
else:
print('Could not interpret response. Try again.')
for term, freq in document.iteritems():
if not self.corpus.get(term, False):
self.corpus[term] = Counter()
self.corpus[term][document_name] = freq
self.document_list.append(document_name)
self.document_lengths[document_name] = sum(document.itervalues())
def delete_document(self, document_name):
"""Delete a document from the corpus.
:param document_name: string indicating document's name in the corpus - should exist in self.document_list
"""
if document_name not in self.document_list:
return
        for freqs in self.corpus.itervalues():
            freqs.pop(document_name, None)
self.document_list.remove(document_name)
self.document_lengths.pop(document_name)
def append_document(self, document, document_name):
"""Add new counts to an existing document. If the document doesn't exist in the corpus then it is added.
:param document: dict or Counter of word counts, e.g. {'i': 1, 'like': 2, 'cheese': 1}
:param document_name: string indicating document's name in the corpus - should exist in self.document_list
"""
if document_name not in self.document_list:
self.add_document(document, document_name)
else:
for term, freq in document.iteritems():
if not self.corpus.get(term, False):
self.corpus[term] = Counter()
self.corpus[term][document_name] += freq
self.document_lengths[document_name] += sum(document.itervalues())
def get_idf(self, term):
"""Get inverse document frequency of a given term in the corpus."""
num_documents = len(self.document_list)
docs_containing_term = len(self.corpus[term])
return math.log(num_documents / (1 + docs_containing_term))
def get_tfidf(self, term, document_name):
"""Get tf-idf score given a term and document in the corpus."""
        tf = self.corpus[term].get(document_name, 0) / self.document_lengths[document_name]
idf = self.get_idf(term)
return tf * idf
def get_document_tfidfs(self, document_name, l2_norm=True):
"""Get tf-idf scores for all terms in a document.
:param document_name: string indicating document's name in the corpus - should exist in self.document_list
:param l2_norm: if True, applies Euclidean normalization to tf-idf scores of the document
:return: Counter of tf-idf scores for each term
"""
        tfidfs = {
            term: self.get_tfidf(term, document_name) for term, freq in self.corpus.iteritems()
            if freq.get(document_name, 0)
        }
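        # Euclidean (L2) normalisation: dividing every score by the vector's L2 norm scales
        # the document's tf-idf vector to unit length.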
if l2_norm:
normalization = np.linalg.norm(tfidfs.values(), axis=0)
for key, value in tfidfs.items():
tfidfs[key] = value / normalization
return Counter(tfidfs)
def get_top_terms(self, document_name, num_terms=30):
"""Get the top terms for a given document by tf-idf score.
:param document_name: string indicating document's name in the corpus - should exist in self.document_list
:param num_terms: number of top terms to return (30 by default)
:return: dict of top terms and their corresponding tf-idf scores
"""
tfidfs = self.get_document_tfidfs(document_name)
sorted_tfidfs = sorted(tfidfs.items(), key=operator.itemgetter(1), reverse=True)
return OrderedDict(sorted_tfidfs[:num_terms])
def build_feature_matrix(self, tfidf=True):
"""Transforms the corpus into a scikit-learn vectorizer object which can be used for machine learning.
Used to set the object attributes self.vectorizer and self.feature_matrix.
:param tfidf (bool): if True, applies TfidfTransformer to vectorized features
:return: scikit-learn vectorizer, scipy sparse feature matrix and its corresponding document labels
"""
train_data = [self.get_document(document) for document in self.document_list]
labels = self.document_list
vectorizer = DictVectorizer()
feature_matrix = vectorizer.fit_transform(train_data)
self.tfidf_transformer = None
if tfidf:
self.tfidf_transformer = TfidfTransformer()
feature_matrix = self.tfidf_transformer.fit_transform(feature_matrix)
self.vectorizer = vectorizer
self.feature_matrix = feature_matrix
return feature_matrix, labels, vectorizer
def train_classifier(self, classifier_type='LinearSVC', tfidf=True):
"""Trains a document classifier using the vocabulary and documents contained in the corpus. Uses scikit-learn.
:param classifier_type (str): 'LinearSVC' or 'MultinomialNB' (LinearSVC by default)
:param tfidf (bool): if True, applies TfidfTransformer to vectorized features
:return: classifier object
"""
self.build_feature_matrix(tfidf=tfidf)
if classifier_type.lower() == 'linearsvc':
classifier = OneVsRestClassifier(LinearSVC(random_state=0))
elif classifier_type.lower() == 'multinomialnb':
classifier = OneVsRestClassifier(MultinomialNB())
else:
            raise Exception("Parameter classifier_type only accepts 'MultinomialNB' or 'LinearSVC'.")
classifier.fit(self.feature_matrix, self.document_list)
self.classifier = classifier
return classifier
def classify_document(self, document):
"""Classifies an input document using a bag-of-words approach with sparse features.
:param document (dict): dict or Counter of the form {'word1': freq1, 'word2': freq2, ...}
:return (str): label corresponding to the document's classification
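        Illustrative call (assumes train_classifier has already been run; the terms are hypothetical):
        >>> corpus.classify_document(Counter({'cheese': 2, 'wine': 1}))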
"""
test_data = self.vectorizer.transform([document])
if self.tfidf_transformer:
test_data = self.tfidf_transformer.transform(test_data)
        return self.classifier.predict(test_data)[0]
def count_words_from_list(self, document_name, word_list, normalize=True):
"""Given a list of input words, return the counts of these words in a specified document."""
document = self.get_document(document_name)
word_counts = [document[word] for word in word_list]
total_count = sum(word_counts)
if normalize:
total_count /= self.document_lengths[document_name]
return total_count
def get_mean_word_length(self, document_name, upper_limit=12):
"""Get the average word length for all words in a given document."""
document = self.get_document(document_name)
return sum([len(term) * freq for term, freq in document.iteritems()
if len(term) <= upper_limit]) / sum(document.itervalues())
@staticmethod
def check_corpus_path(corpus_path):
if not corpus_path.lower().endswith('.json'):
raise Exception('corpus_path provided is not a valid JSON file.')
make_path(corpus_path)
def make_path(path):
"""Check if path exists. If it doesn't, create the necessary folders."""
# Remove file name from path
base_name = os.path.basename(path)
if '.' in base_name:
path = path[:-len(base_name)]
    if path and not os.path.exists(path):
try:
os.makedirs(path)
except OSError as exception:
if exception.errno != errno.EEXIST:
raise
return path
def get_word_corpora():
"""Returns a list of paths to all word corpora installed in the module."""
application_root = get_root_dir()
words_dir = os.path.join(application_root, 'words')
return os.listdir(words_dir)
def get_root_dir():
return os.path.dirname(__file__)
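# A minimal end-to-end sketch of how the classes above can be combined. The Reddit
# username, subreddit name and corpus path are hypothetical placeholders; running this
# requires network access, an older PRAW release that still provides praw.handlers,
# and NLTK's 'punkt' tokenizer data.
if __name__ == '__main__':
    word_counter = RedditWordCounter('your_username')
    corpus = TfidfCorpus('corpus.json')
    # Count the words used in a subreddit's hot comments and store them as a document
    vocabulary = word_counter.subreddit_comments('learnpython', limit=200)
    corpus.add_document(vocabulary, 'learnpython')
    # Persist the corpus and print the terms with the highest tf-idf scores
    corpus.save()
    print(corpus.get_top_terms('learnpython', num_terms=10))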