-
Notifications
You must be signed in to change notification settings - Fork 0
/
keywords_extractor.py
136 lines (111 loc) · 3.92 KB
/
keywords_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python
__author__="ilya"
__date__ ="$01.11.2012 03:08:31$"
import sys
import nltk
# Module-level state shared by the functions below.
# Candidate key phrases; reset and filled by make_keyword_candidates().
keyword_candidates = []
# List of rows of co-occurrence counts; filled by make_cooccurrence_matrix()
# and read by make_keywords_list().  Row/column order follows list(words).
cooccurrence_matrix = []
# Vocabulary: maps every word found in the keyword candidates to 1 (only the
# keys are used).
words = dict()
def make_keyword_candidates(text):
    """Extract candidate key phrases from *text* into ``keyword_candidates``.

    The text is split into sentences on ``'. '``.  Each sentence is tokenized
    and POS-tagged with NLTK and then chunked with a regular-expression
    grammar that groups a run of ``VB/VBN/JJ/JJR`` tags followed by a run of
    ``NN/NNS/NNP`` tags into an ``NP`` chunk.  Every ``NP`` chunk yields one
    candidate: its words of three or more characters joined by single spaces.
    Chunks with no such word yield nothing.

    Side effects: rebinds the module-level ``keyword_candidates``,
    ``cooccurrence_matrix`` and ``words`` to fresh empty containers before
    collecting, so state left over from a previous text is discarded.

    Returns None; the result is stored in ``keyword_candidates``.
    """
    global keyword_candidates
    global cooccurrence_matrix
    global words
    keyword_candidates = []
    cooccurrence_matrix = []
    words = dict()

    pattern = "NP:{<VB|VBN|JJ|JJR>*<NN|NNS|NNP>*}"
    chunker = nltk.RegexpParser(pattern)

    for sentence in text.split('. '):
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        if not tagged:
            # Nothing to chunk (e.g. empty input or a trailing separator).
            continue
        for subtree in chunker.parse(tagged):
            if not isinstance(subtree, nltk.tree.Tree):
                continue
            # NLTK 3 replaced the ``node`` attribute with ``label()``;
            # support both so the code runs on old and new NLTK releases.
            label_getter = getattr(subtree, 'label', None)
            if callable(label_getter):
                label = label_getter()
            else:
                label = subtree.node
            if label != 'NP':
                continue
            # Leaves are (word, tag) pairs; short words are dropped.
            kept = [leaf[0] for leaf in subtree.leaves() if len(leaf[0]) >= 3]
            if kept:
                keyword_candidates.append(' '.join(kept).strip())
def make_cooccurrence_matrix():
    """Build the word co-occurrence matrix for the current keyword candidates.

    Reads the module-level ``keyword_candidates``, registers every
    space-separated word of every candidate as a key of ``words`` and then
    fills ``cooccurrence_matrix`` so that ``cooccurrence_matrix[i][j]`` is the
    number of candidates that contain both ``list(words)[i]`` and
    ``list(words)[j]``.  The diagonal therefore holds the number of
    candidates each word appears in.  A candidate that is listed several
    times is counted once per listing; a word repeated inside one candidate
    is counted once for that candidate.

    Side effects: adds keys to ``words`` (existing keys are kept) and
    replaces the contents of ``cooccurrence_matrix`` in place, so calling the
    function again rebuilds the matrix instead of appending extra rows.

    Returns None.
    """
    global cooccurrence_matrix
    global keyword_candidates
    global words

    # Tokenize every candidate once instead of once per matrix cell.
    candidate_tokens = [candidate.split(' ') for candidate in keyword_candidates]

    for tokens in candidate_tokens:
        for token in tokens:
            words[token] = 1

    vocabulary = list(words)
    size = len(vocabulary)
    row_of = dict((word, row) for row, word in enumerate(vocabulary))

    # Start from an all-zero square matrix; clearing in place keeps any
    # existing reference to the list object valid.
    del cooccurrence_matrix[:]
    cooccurrence_matrix.extend([0] * size for _ in range(size))

    for tokens in candidate_tokens:
        # Distinct vocabulary rows present in this candidate.
        present = set()
        for token in tokens:
            row = row_of.get(token.strip())
            if row is not None:
                present.add(row)
        for i in present:
            matrix_row = cooccurrence_matrix[i]
            for j in present:
                matrix_row[j] += 1
def make_keywords_list():
    """Rank the keyword candidates and return the best-scoring ones.

    Reads the module-level ``cooccurrence_matrix``, ``keyword_candidates``
    and ``words`` (the matrix rows are expected to follow ``list(words)``).

    Each word is scored as ``degree / frequency``, where *frequency* is the
    word's diagonal entry in the matrix and *degree* is the sum of its row.
    A word whose frequency is zero scores 0.0 instead of raising
    ``ZeroDivisionError``.  A candidate's score is the sum of the scores of
    the words it contains, matched as whole space-separated words (so "art"
    does not count towards "party").  A candidate listed several times in
    ``keyword_candidates`` accumulates the score once per listing.

    Returns:
        A list of candidate strings sorted by descending score, holding at
        most ``len(distinct candidates) // 3 + 2`` entries.
    """
    global cooccurrence_matrix
    global keyword_candidates
    global words

    vocabulary = list(words)

    # (word, score) pairs in vocabulary order.
    scored_words = []
    for index, row in enumerate(cooccurrence_matrix):
        frequency = row[index]
        degree = float(sum(row))
        score = degree / frequency if frequency else 0.0
        scored_words.append((vocabulary[index], score))

    keywords = dict()
    for candidate in keyword_candidates:
        keywords[candidate] = 0

    # Split each candidate once; whole-word membership avoids crediting a
    # word to candidates that merely contain it as a substring.
    candidate_tokens = [
        (candidate, set(token.strip() for token in candidate.split(' ')))
        for candidate in keyword_candidates
    ]
    for word, score in scored_words:
        for candidate, tokens in candidate_tokens:
            if word in tokens:
                keywords[candidate] += score

    ranked = sorted(keywords.items(), key=lambda pair: pair[1], reverse=True)
    # Same cut-off as the historical loop: roughly the top third plus two.
    limit = len(keywords) // 3 + 2
    return [candidate for candidate, _score in ranked[:limit]]
def print_cooccurrence_matrix():
    """Print the module-level ``cooccurrence_matrix`` to standard output.

    The matrix is written in its plain list representation followed by a
    newline.  Returns None.
    """
    global cooccurrence_matrix
    # The parenthesized single-argument form prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print(cooccurrence_matrix)