-
Notifications
You must be signed in to change notification settings - Fork 0
/
classifier.py
52 lines (47 loc) · 1.75 KB
/
classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
from rdt.data.mongo.source import Source
import rdt.nlp.pos as pos
from nltk.corpus import stopwords
from nltk.classify import PositiveNaiveBayesClassifier
import rdt.nlp.annotate as annotate, rdt.nlp.conll_get as conll_get
import rdt.nlp.ngrams as ngrams
def feature(document):
    """Build bag-of-words features for a single reddit document.

    Only the ``cleansed_text`` entry of the document is read. Its text is
    run through ``filter_words`` and every resulting word becomes a
    ``contains(<word>)`` key mapped to ``True``. No subreddit feature is
    added by this function.

    :param document: The dictionary reddit document.
    :type document: dict
    :returns: Mapping of ``contains(<word>)`` feature names to ``True``.
    :rtype: dict
    """
    words = filter_words(document["cleansed_text"])
    return {'contains(%s)' % word: True for word in words}
def ner_feature(document, tagger=None):
    """Build noun-presence features for a single document.

    The document is handed to ``annotate.dirty_dict`` and every item found
    under the ``"nouns"`` key of its result becomes a
    ``contains_noun(<noun>)`` key mapped to ``True``.

    :param document: Document passed unchanged to ``annotate.dirty_dict``.
    :param tagger: Tagger forwarded to ``annotate.dirty_dict``. When
        ``None``, one is created with ``ngrams.make_backoff_tagger()``.
    :returns: Mapping of ``contains_noun(<noun>)`` feature names to ``True``.
    :rtype: dict
    """
    active_tagger = ngrams.make_backoff_tagger() if tagger is None else tagger
    annotated = annotate.dirty_dict(document, tagger=active_tagger)
    return {"contains_noun(" + noun + ")": True for noun in annotated["nouns"]}
def filter_words(text):
    """Prepare text for bag of words: lowercased words without stopwords.

    The text is split into sentences and then into words with the ``pos``
    tokenizers, the per-sentence word lists are flattened, English
    stopwords are dropped (compared case-insensitively) and the remaining
    words are returned lowercased in their original order.

    :param text: blob of text
    :type text: presumably str -- TODO confirm against ``pos.tokenize_sents``
    :returns: Lowercased words that are not English stopwords.
    :rtype: list
    """
    sents = pos.tokenize_words(pos.tokenize_sents(text))
    # Build the stopword lookup once as a set: the corpus reader returns a
    # list, which would otherwise be scanned linearly for every word.
    stop = set(stopwords.words('english'))
    # Turn the list of sentences into one stream of words, lowercasing each
    # word a single time before it is checked against the stopwords.
    lowered = (word.lower() for sent in sents for word in sent)
    return [word for word in lowered if word not in stop]
def subreddit(subreddit=None, batch_size=100, host="localhost", port=27017,
              database="reddit_stream", collection="combined"):
    """Query the document store for the documents of one subreddit.

    Opens a ``Source`` on the given connection settings and returns the
    result of ``Source.find_clean`` filtered on the ``subreddit`` field
    (presumably a cursor -- confirm against ``Source.find_clean``).

    :param subreddit: Value matched against the ``subreddit`` field. When
        ``None``, nothing is queried and ``None`` is returned.
    :param batch_size: Batch size forwarded to ``Source.find_clean``.
    :param host: Host forwarded to ``Source``.
    :param port: Port forwarded to ``Source``.
    :param database: Database name forwarded to ``Source``.
    :param collection: Collection name forwarded to ``Source``.
    :returns: The ``find_clean`` result, or ``None`` if no subreddit is given.
    """
    # Guard first so that no connection is opened when there is nothing to query.
    if subreddit is None:
        return None
    source = Source(host=host, port=port, database=database, collection=collection)
    return source.find_clean({"subreddit": subreddit}, batch_size=batch_size)
def positive_naive_bayes(pos_cursor, unlabeled_cursor):
    """Train a ``PositiveNaiveBayesClassifier`` from two document iterables.

    Entire documents are expected: each one is converted to a feature
    dictionary with ``feature`` before training. The positive iterable is
    consumed completely first, then the unlabeled one.

    :param pos_cursor: Iterable of documents for the positive class.
    :param unlabeled_cursor: Iterable of unlabeled documents.
    :returns: The result of ``PositiveNaiveBayesClassifier.train``.
    """
    positive_featuresets = [feature(doc) for doc in pos_cursor]
    unlabeled_featuresets = [feature(doc) for doc in unlabeled_cursor]
    return PositiveNaiveBayesClassifier.train(positive_featuresets, unlabeled_featuresets)