-
Notifications
You must be signed in to change notification settings - Fork 0
/
hp_classifiers.py
109 lines (87 loc) · 3.78 KB
/
hp_classifiers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
####################################################################
# Licence: Creative Commons (see COPYRIGHT) #
# Authors: Nikolaos Pappas, Georgios Katsimpras #
# {nik0spapp, gkatsimpras}@gmail.com #
# Supervisor: Efstathios Stamatatos #
# stamatatos@aegean.gr #
# University of the Aegean #
# Department of Information and Communication Systems Engineering #
# Information Management Track (MSc) #
# Karlovasi, Samos #
# Greece #
####################################################################
import sys
import nltk
from lexicon import Lexicon
from stemming.porter2 import stem
class HpSubj:
    """
    High precision subjective sentence classifier which uses an annotated
    lexicon of words as features. It classifies a sentence as subjective
    if it contains two or more of the strong subjective clues.

    A token is a strong clue when its lower-cased surface form or its stem
    is tagged 'strongsubj' in the lexicon. Each token contributes at most
    one clue, even when both of its forms are found in the lexicon.
    """

    # Optional per-instance overrides; None selects nltk.word_tokenize and
    # the porter2 stem function imported by this module.
    tokenizer = None
    stemmer = None

    def __init__(self, debug=False, tokenizer=None, stemmer=None):
        """
        debug     -- stored on the instance; classify() does not read it.
        tokenizer -- optional callable mapping a sentence to its tokens.
        stemmer   -- optional callable mapping a word to its stem.
        """
        self.dictionary = Lexicon().words
        self.debug = debug
        if tokenizer is not None:
            self.tokenizer = tokenizer
        if stemmer is not None:
            self.stemmer = stemmer

    def classify(self, sentence):
        """
        Return True when `sentence` contains at least two strong subjective
        clues, False otherwise.
        """
        wdict = self.dictionary
        tokenize = self.tokenizer or nltk.word_tokenize
        stem_word = self.stemmer or stem
        strong_subjective_words_count = 0

        for token in tokenize(sentence):
            word = token.lower()
            # The surface form and the stem are two spellings of the same
            # token: a hit on either one (or on both) counts as one clue.
            forms = (word, stem_word(word))
            if any(form in wdict and wdict[form]['type'] == 'strongsubj'
                   for form in forms):
                strong_subjective_words_count += 1
                if strong_subjective_words_count >= 2:
                    # Threshold reached; the rest of the sentence cannot
                    # change the answer.
                    return True
        return False
class HpObj:
    """
    High precision objective sentence classifier which uses an annotated
    lexicon as training data. It classifies a sentence as objective if the
    sentence, taken together with its previous and next sentence, contains
    no strong subjective clue and at most one weak subjective clue.

    A token is a clue when its lower-cased surface form or its stem is
    found in the lexicon. Each token contributes at most one clue, even
    when both of its forms are found in the lexicon.
    """

    # Optional per-instance overrides; None selects nltk.word_tokenize and
    # the porter2 stem function imported by this module.
    tokenizer = None
    stemmer = None

    def __init__(self, debug=False, tokenizer=None, stemmer=None):
        """
        debug     -- when true, classify() prints the sentences it receives.
        tokenizer -- optional callable mapping a sentence to its tokens.
        stemmer   -- optional callable mapping a word to its stem.
        """
        self.lexicon = Lexicon()
        self.dictionary = self.lexicon.words
        self.debug = debug
        if tokenizer is not None:
            self.tokenizer = tokenizer
        if stemmer is not None:
            self.stemmer = stemmer

    def _clue_type(self, word, stemmed, wdict):
        """
        Return 'strongsubj' or 'weaksubj' for one token (strong wins when
        its two forms disagree), or None when neither form is such a clue.
        """
        types = [wdict[form]['type'] for form in (word, stemmed)
                 if form in wdict]
        if 'strongsubj' in types:
            return 'strongsubj'
        if 'weaksubj' in types:
            return 'weaksubj'
        return None

    def classify(self, current, previous="", next=""):
        """
        Return True when `current` is objective given its neighbouring
        sentences `previous` and `next`, False otherwise.
        """
        if self.debug:
            # Single-argument print calls behave the same on Python 2 and 3.
            print("")
            print("current: %s" % (current,))
            print("previous: %s" % (previous,))
            print("next: %s" % (next,))
            print("")

        wdict = self.dictionary
        tokenize = self.tokenizer or nltk.word_tokenize
        stem_word = self.stemmer or stem
        weak_subjective_words_count = 0

        # Clues are pooled over the three sentences, so scanning order does
        # not affect the verdict.
        for sentence in (current, previous, next):
            for token in tokenize(sentence):
                word = token.lower()
                clue = self._clue_type(word, stem_word(word), wdict)
                if clue == 'strongsubj':
                    # A single strong clue rules out objectivity.
                    return False
                if clue == 'weaksubj':
                    weak_subjective_words_count += 1
                    if weak_subjective_words_count > 1:
                        return False
        return True
if __name__ == '__main__':
    # Command-line entry point: classify the sentence given as the first
    # argument with both high-precision classifiers and print the verdicts.
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s <sentence>\n" % sys.argv[0])
        sys.exit(1)
    sentence = sys.argv[1]
    hpo = HpObj()
    hps = HpSubj()
    # Single-argument print calls behave the same on Python 2 and 3.
    print("Objective: " + str(hpo.classify(sentence)))
    print("Subjective: " + str(hps.classify(sentence)))