This repository has been archived by the owner on Nov 11, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Scorer.py
241 lines (204 loc) · 7.88 KB
/
Scorer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
#from nltk.corpus import movie_reviews
from textblob import TextBlob, Word
from SearchPacket import SearchPacket
from TextPreprocessor import TextPreprocessor
from sklearn.linear_model import LogisticRegression
from math import ceil
import numpy as np
'''
This class will be fed words and their significance.
It will score a sentence based on whether those words appear,
and if they do, how significant they are to our attribute.
@author: Justin A. Middleton
@date: 26 Apr 2015
'''
class Scorer():
    """Scores text against the weighted attribute words in a SearchPacket.

    Workflow: score() rates one post at a time; sum_posts() totals a user's
    post scores into a single 5-vector; add_point()/fit_graphs() build one
    logistic regression per attribute across all users; get_prob() turns a
    user's total score vector into a weighted match probability.
    """

    def __init__(self, searchPacket=None):
        """Store the SearchPacket whose attributes drive all scoring.

        Raises:
            ValueError: if searchPacket is None or not a SearchPacket.
        """
        if searchPacket is None or not isinstance(searchPacket, SearchPacket):
            raise ValueError("__init__: must have search packet parameter.")
        self.packet = searchPacket
        self.points = []  # one summed 5-score vector per user
        self.graphs = []  # one fitted LogisticRegression (or None) per attribute

    def score(self, text):
        """Score a piece of text against every attribute in the packet.

        Uses TextBlob's pattern analyzer for overall polarity; an attribute
        word only counts when the text's polarity agrees in sign with the
        word's expected sentiment.

        Returns:
            A list of at least 5 numbers, one weighted-frequency score per
            attribute, zero-padded to length 5.

        Raises:
            ValueError: if text is not a string.
        """
        # Accept both Python 2 (str/unicode) and Python 3 (str) strings;
        # `basestring` does not exist on Python 3.
        try:
            string_types = basestring  # noqa: F821 (Python 2)
        except NameError:
            string_types = str  # Python 3
        if not isinstance(text, string_types):
            raise ValueError("score: text must be a string.")

        processed = TextPreprocessor(text)
        words = processed.get_words()
        polarity = TextBlob(processed.get_raw()).sentiment.polarity

        scores = []
        for attr in self.packet.getAttributes():
            attrScore = 0
            for i in range(attr.get_size()):
                word = attr.get_word(i)
                multiword = len(word.split()) > 1
                expectedSent = attr.get_sentiment_num(i)
                # Count the word only when the text's overall polarity agrees
                # in sign with the expected sentiment (0 matches everything).
                if polarity * expectedSent >= 0:
                    significance = attr.get_weight_num(i)
                    if multiword:
                        # Phrases must be matched against the re-joined token
                        # stream rather than the single-word list.
                        spacedText = " ".join(processed.get_tokens())
                        attrScore += spacedText.count(word) * significance
                    else:
                        attrScore += words.count(word) * significance
            scores.append(attrScore)

        # Pad to a fixed width of 5 so downstream code can rely on the shape.
        while len(scores) < 5:
            scores.append(0)
        return scores

    def sum_posts(self, posts):
        """Sum the per-post score vectors of one user into a single 5-vector.

        posts: a list of dicts from the database; each well-formed dict has
        keys "score1" through "score5" holding the values produced by
        score(). Dicts missing any score key are skipped.

        Returns:
            The element-wise sum as a list of 5 numbers.

        Raises:
            ValueError: if posts is not a list of dicts.
        """
        if not isinstance(posts, list):
            raise ValueError("sum_posts: parameter posts must be a list")
        elif len(posts) > 0 and not isinstance(posts[0], dict):
            raise ValueError("sum_posts: parameter posts must be populated with dicts")

        scoreKeys = ["score%d" % i for i in range(1, 6)]
        sumVector = [0, 0, 0, 0, 0]
        for post in posts:
            # Skip malformed rows that lack any of the five score columns.
            if not all(key in post for key in scoreKeys):
                continue
            sumVector = [total + post[key] for total, key in zip(sumVector, scoreKeys)]
        return sumVector

    def add_point(self, point):
        """Add one user's total score vector to the regression dataset.

        Raises:
            ValueError: if point is not a list.
        """
        if not isinstance(point, list):
            raise ValueError("add_point: parameter point must be a list")
        self.points.append(point)

    def fit_graphs(self):
        """Fit one logistic regression per attribute over all added points.

        A regression needs 0/1 class labels, which we do not have up front.
        The highest scorer in each dimension is always labeled 1; roughly the
        top 10% of users are additionally labeled 1 by scattering hits at
        even increments of the standard deviation above the mean, which
        biases the hits toward the upper end of the distribution.

        Appends None instead of a regression for a dimension with fewer than
        two usable points, or whose scores are all zero.
        """
        for i in range(5):
            # Only users with at least one non-zero attribute score count.
            dimension = sorted(x[i] for x in self.points if sum(x) > 0)
            # Nothing can be fit from <2 points or an all-zero dimension.
            if len(dimension) < 2 or dimension[-1] == 0:
                self.graphs.append(None)
                continue

            avg = sum(dimension, 0.0) / len(dimension)
            std = self.getSTD(dimension)

            # The highest scorer is always a hit.
            yAxis = [0] * len(dimension)
            yAxis[-1] = 1

            # Label roughly the top 10% of users as hits.
            numToHit = int(ceil(len(dimension) / 10.0))
            if numToHit > 1:
                # Even increments over [avg, avg + 2*std]; using the mean plus
                # multiples of the std pushes most hits toward the high end.
                increment = 2.0 / (numToHit - 1)
                for j in range(numToHit):
                    self.setClosestTo(dimension, yAxis, avg + (j * increment) * std)
            elif len(yAxis) >= 3:
                # With a single extra hit to place, take the second-highest.
                # With exactly two points the labels are already [0, 1].
                yAxis[-2] = 1

            logit = LogisticRegression(C=1.0)
            data = np.array([[d] for d in dimension])  # sklearn wants 2-D X
            logit.fit(data, np.array(yAxis))
            self.graphs.append(logit)

    def getSTD(self, dimension):
        """Return the standard deviation of the values via numpy.

        Raises:
            ValueError: if dimension is not a non-empty list of numbers.
        """
        if not isinstance(dimension, list):
            raise ValueError("getSTD: parameter dimension must be a list")
        elif len(dimension) == 0:
            raise ValueError("getSTD: parameter dimension must not be empty")
        elif not isinstance(dimension[0], (int, float)):
            # Weighted scores may legitimately be floats, so accept both.
            raise ValueError("getSTD: values in dimension must be numbers")
        return np.std(np.array(dimension))

    def setClosestTo(self, dimension, yAxis, limit):
        """Mark as a hit the first value in sorted dimension >= limit.

        If that value is already a hit, mark the one below it instead; if
        that one is also a hit, accept the loss rather than walking further
        back. If no value reaches limit, the highest value becomes a hit.
        The lowest value (index 0) is never marked.

        Raises:
            ValueError: if the lists differ in length or hold < 2 values.
        """
        if len(dimension) != len(yAxis):
            raise ValueError("setClosestTo: the two lists must be the same length")
        elif len(dimension) < 2:
            raise ValueError("setClosestTo: there must be at least two values in the list")

        found = False
        # Start at 1: never let the lowest value be a hit.
        for i in range(1, len(dimension)):
            if dimension[i] >= limit:
                if yAxis[i] == 0:
                    yAxis[i] = 1
                elif i > 1:
                    yAxis[i - 1] = 1
                found = True
                break
        if not found:
            yAxis[-1] = 1

    def get_prob(self, point):
        """Return the weighted-average probability that point matches class 1.

        Each attribute's fitted regression predicts a probability for that
        attribute's score; the result is the average of those probabilities
        weighted by each attribute's own weight. Attributes whose regression
        is None, or whose score is 0, are ignored entirely — the absence of
        a word is (for now) treated as uninformative rather than negative.

        point: list of five numbers, the user's total score vector.

        Raises:
            ValueError: on a malformed point, or if fit_graphs() was not run.
        """
        if not isinstance(point, list):
            raise ValueError("get_prob: parameter point must be a list")
        elif len(point) != 5 or not isinstance(point[0], (int, float)):
            raise ValueError("get_prob: parameter point must be a numeric list of length 5")
        elif len(self.graphs) == 0:
            raise ValueError("get_prob: graphs not yet fit")

        prob = 0
        ctr = 0
        for i, attr in zip(range(5), self.packet.getAttributes()):
            attrWeight = attr.get_attr_weight_num()
            logit = self.graphs[i]
            # None means this dimension had nothing worth fitting.
            if logit is None:
                continue
            num = point[i]
            if num > 0:
                # sklearn expects a 2-D samples array: one sample, one feature.
                predictedProb = logit.predict_proba([[num]])[0][1]
                prob += predictedProb * attrWeight
                ctr += attrWeight
        if ctr > 0:
            prob /= ctr
        return prob