-
Notifications
You must be signed in to change notification settings - Fork 1
/
MyClassifiers.py
148 lines (121 loc) · 4.69 KB
/
MyClassifiers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
'''
Created on Dec 9, 2015
@author: hanhanwu
'''
import re
import math
class classifier:
    '''Count-based model relating features to categories.

    Training records how many times each feature was seen with each
    category (``fc``) and how many items were trained per category (``cc``).
    The probability helpers are derived purely from those two tables.
    '''

    def __init__(self, getfeatures, filename=None):
        '''Create an empty model.

        getfeatures -- callable that turns an item into an iterable of features.
        filename    -- accepted for interface compatibility; not used by this class.
        '''
        # feature -> {category -> number of times the feature was seen there}
        self.fc = {}
        # category -> number of items trained for that category
        self.cc = {}
        self.getfeatures = getfeatures

    def infc(self, feat, cate):
        '''Increase the count of the (feature, category) combination by one.'''
        per_category = self.fc.setdefault(feat, {})
        per_category[cate] = per_category.get(cate, 0) + 1

    def incc(self, cate):
        '''Increase the item count of a category by one.'''
        self.cc[cate] = self.cc.get(cate, 0) + 1

    def fcount(self, feat, cate):
        '''Return, as a float, how often a feature was seen in a category.'''
        return float(self.fc.get(feat, {}).get(cate, 0))

    def ccount(self, cate):
        '''Return, as a float, the number of items trained for a category.'''
        return float(self.cc.get(cate, 0))

    def itemscount(self):
        '''Return the total number of trained items across all categories.'''
        return sum(self.cc.values())

    def categories(self):
        '''Return a list of all categories seen during training.

        A list snapshot is returned (rather than a live dict view) so the
        result stays valid if the model is trained while it is being used.
        '''
        return list(self.cc)

    def train(self, item, cate):
        '''Extract the item's features and count them, and the item, under cate.'''
        features = self.getfeatures(item)
        for feat in features:
            self.infc(feat, cate)
        self.incc(cate)

    def fprob(self, feat, cate):
        '''Return the feature count in a category divided by the category's item count.

        Returns 0.0 for a category that has no trained items.
        '''
        in_category = self.ccount(cate)
        if in_category == 0:
            return 0.0
        return self.fcount(feat, cate) / in_category

    def weightedprob(self, feat, cate, myweight=1.0, ap=0.5):
        '''Return fprob blended with an assumed probability.

        The assumed probability ``ap`` is given weight ``myweight``; the
        measured probability is weighted by how many times the feature was
        seen across all categories. A feature that was never seen therefore
        evaluates to ``ap`` instead of 0.
        '''
        basicprob = self.fprob(feat, cate)
        # Number of times this feature appeared across all categories.
        totals = sum(self.fcount(feat, c) for c in self.categories())
        denominator = myweight + totals
        if denominator == 0:
            # No weight and no evidence: fall back to the assumed probability
            # (the limit of the formula below) instead of dividing by zero.
            return ap
        return ((myweight * ap) + (totals * basicprob)) / denominator
# Fisher classifier: combines per-feature probabilities and scores the
# combination against a chi-square distribution.
class fisherclassifier(classifier):
    '''Classifier that combines per-feature probabilities with Fisher's method.'''

    def cprob(self, feat, cate):
        '''Return this category's share of the feature's frequency over all categories.

        Returns 0.01 (not 0) when the feature has no frequency in ``cate``.
        '''
        # Frequency of this feature in this category.
        clf = self.fprob(feat, cate)
        if clf == 0:
            return 0.01
        # Sum of this feature's frequencies over every category.
        freqsum = sum(self.fprob(feat, c) for c in self.categories())
        return clf / freqsum

    def fisherprob(self, features, cate):
        '''Combine the individual feature probabilities into one overall score.

        features -- sized collection of features (``len`` is taken of it).
        cate     -- category to score the features against.

        NOTE(review): cprob's result is passed as the third positional
        argument of weightedprob, i.e. as ``myweight``; the base probability
        inside weightedprob is still fprob. Confirm this is intended.
        '''
        # Sum the logs rather than taking the log of the product: the product
        # of many small probabilities can underflow to 0.0, which would make
        # math.log raise ValueError.
        log_total = 0.0
        for feat in features:
            log_total += math.log(
                self.weightedprob(feat, cate, self.cprob(feat, cate)))
        fscore = -2 * log_total
        return self.invchi2(fscore, len(features) * 2)

    def invchi2(self, chi, df):
        '''Return sum(exp(-m) * m**i / i!) for i in 0 .. df//2 - 1, capped at 1.0.

        ``m`` is ``chi / 2``. For an even integer ``df`` this series is the
        upper-tail probability of a chi-square distribution with ``df``
        degrees of freedom.
        '''
        m = chi / 2.0
        total = term = math.exp(-m)
        # Floor division keeps the bound an integer even under true division.
        for i in range(1, df // 2):
            term *= m / i
            total += term
        return min(total, 1.0)
# In this project, get_words is used as the classifier's getfeatures callable.
def get_words(txt):
    '''Return the unique lowercase words of a text as a {word: 1} dict.

    The text is split on non-word characters; only pieces longer than 2 and
    shorter than 20 characters are kept.
    '''
    # Raw string: \W inside a plain string literal is an invalid escape sequence.
    splitter = re.compile(r'\W')
    lowered = (piece.lower() for piece in splitter.split(txt)
               if 2 < len(piece) < 20)
    # Dict keys give uniqueness; every word maps to 1.
    return dict.fromkeys(lowered, 1)
def get_category(fisher_classifier, categories, feats):
    '''Pick the category with the highest fisherprob score for the features.

    Returns a (category, probability) tuple. Only a score strictly greater
    than the best so far replaces it, so the first of several equal scores
    wins; if no score is above 0 the result is ('', 0).
    '''
    best = ('', 0)
    for label in categories:
        score = fisher_classifier.fisherprob(feats, label)
        if score > best[1]:
            best = (label, score)
    return best
# Training file read when main() is called without an explicit path.
_DEFAULT_TRAINING_DATA = '/Users/hanhanwu/Documents/workspace/PythonLearning/Sellers++/training_data'


def main(trainingdata_path=_DEFAULT_TRAINING_DATA):
    '''Train both classifiers from a file and print a few sample results.

    Each line of the file is split on '****': the text before the first
    separator is the item, and the text after it, up to the first comma,
    is the category.

    trainingdata_path -- path of the training file; defaults to the
                         original hard-coded location.
    '''
    cl1 = classifier(get_words)
    cl2 = fisherclassifier(get_words)

    # The context manager closes the file even if training raises.
    with open(trainingdata_path, 'r') as trainingdata_file:
        for line in trainingdata_file:
            elems = line.split('****')
            if len(elems) < 2:
                # Blank or malformed line without the separator: skip it.
                continue
            item = elems[0]
            # Strip so a trailing newline never becomes part of the label.
            cate = elems[1].split(',')[0].strip()
            cl1.train(item, cate)
            cl2.train(item, cate)

    p1 = cl1.weightedprob('hanhan', 'music')
    p2 = cl2.cprob('hanhan', 'music')
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(p1)
    print(p2)

    words = ['farook', 'bernardino', 'citigroup', 'funded', 'syed', 'lending', 'made', 'times', 'business', 'rose', 'four', 'hollywood', 'will', 'stocks', 'rates']
    cs = cl2.categories()
    for c in cs:
        print('%s :  %s' % (c, cl2.fisherprob(words, c)))

    fit_category, max_prob = get_category(cl2, cs, words)
    print(fit_category)
    print(max_prob)
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()