-
Notifications
You must be signed in to change notification settings - Fork 0
/
server.py
140 lines (120 loc) · 5.52 KB
/
server.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
from flask import Flask,render_template,url_for,request
import unicodedata
# Flask application object that the route decorators below attach to.
# static_url_path="" presumably exposes static files without the "/static"
# URL prefix -- confirm against the Flask documentation.
app=Flask(__name__,static_url_path="")
# NOTE(review): Flask's debug flag is the lowercase ``app.debug``; ``Debug``
# looks like a typo, in which case this line sets an attribute nothing reads
# and debug mode stays off. Confirm intent before "fixing" it: the server at
# the bottom of this file binds to 0.0.0.0, and enabling the interactive
# debugger there would expose it to the network.
app.Debug=True
# Words shorter than this many characters are dropped by words_cleanup().
_WORD_MIN_LENGTH = 3
# Lower-case words that words_cleanup() drops before indexing or searching.
# Differences from the previous revision of this list: 'fify' -> 'fifty' and
# 'thickv' -> 'thick' (typos whose intended words were missing), the misspelt
# 'amoungst' was removed ('amongst' is already listed), and the duplicated
# 'above' / 'the' entries were dropped (duplicates are no-ops in a frozenset).
_STOP_WORDS = frozenset([
    'a', 'about', 'above', 'across', 'after', 'afterwards', 'again',
    'against', 'all', 'almost', 'alone', 'along', 'already', 'also',
    'although', 'always', 'am', 'among', 'amongst', 'amount', 'an', 'and',
    'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere',
    'are', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become',
    'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being',
    'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both',
    'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant', 'co', 'con',
    'could', 'couldnt', 'cry', 'de', 'describe', 'detail', 'do', 'done',
    'down', 'due', 'during', 'each', 'eg', 'eight', 'either', 'eleven',
    'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every',
    'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen',
    'fifty', 'fill', 'find', 'fire', 'first', 'five', 'for', 'former',
    'formerly', 'forty', 'found', 'four', 'from', 'front', 'full', 'further',
    'get', 'give', 'go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her',
    'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself',
    'him', 'himself', 'his', 'how', 'however', 'hundred', 'ie', 'if', 'in',
    'inc', 'indeed', 'interest', 'into', 'is', 'it', 'its', 'itself', 'keep',
    'last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made', 'many',
    'may', 'me', 'meanwhile', 'might', 'mill', 'mine', 'more', 'moreover',
    'most', 'mostly', 'move', 'much', 'must', 'my', 'myself', 'name',
    'namely', 'neither', 'never', 'nevertheless', 'next', 'nine', 'no',
    'nobody', 'none', 'noone', 'nor', 'not', 'nothing', 'now', 'nowhere',
    'of', 'off', 'often', 'on', 'once', 'one', 'only', 'onto', 'or', 'other',
    'others', 'otherwise', 'our', 'ours', 'ourselves', 'out', 'over', 'own',
    'part', 'per', 'perhaps', 'please', 'put', 'rather', 're', 'same', 'see',
    'seem', 'seemed', 'seeming', 'seems', 'serious', 'several', 'she',
    'should', 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so',
    'some', 'somehow', 'someone', 'something', 'sometime', 'sometimes',
    'somewhere', 'still', 'such', 'system', 'take', 'ten', 'than', 'that',
    'the', 'their', 'them', 'themselves', 'then', 'thence', 'there',
    'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these',
    'they', 'thick', 'thin', 'third', 'this', 'those', 'though', 'three',
    'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'top',
    'toward', 'towards', 'twelve', 'twenty', 'two', 'un', 'under', 'until',
    'up', 'upon', 'us', 'very', 'via', 'was', 'we', 'well', 'were', 'what',
    'whatever', 'when', 'whence', 'whenever', 'where', 'whereafter',
    'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether',
    'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose',
    'why', 'will', 'with', 'within', 'without', 'would', 'yet', 'you',
    'your', 'yours', 'yourself', 'yourselves',
])
@app.route('/')
def index():
    """Serve the site root by rendering the ``index.html`` template."""
    page = render_template('index.html')
    return page
@app.route('/generate_inverted_index/',methods=['POST'])
def hello():
    """Index the two submitted documents and render the hits for the query.

    Reads the ``document1``, ``document2`` and ``query`` form fields, builds a
    positional inverted index over both documents, asks ``search`` which
    documents match the query, and collects a 20-character snippet starting
    at every occurrence of each indexed query word in those documents.

    Returns:
        The rendered ``lol.html`` template. Every local variable of this
        function is handed to the template via ``**locals()``, so the local
        names used below are effectively the template's context keys and are
        deliberately left unchanged.
    """
    doc1 = request.form['document1']
    doc2 = request.form['document2']
    query = request.form['query']
    inverted = {}
    documents = {'doc1': doc1, 'doc2': doc2}
    ans = []

    def extract_text(doc, index):
        # Snippet of up to 20 characters starting at the word, on one line.
        return documents[doc][index:index + 20].replace('\n', ' ')

    # .items() instead of .iteritems() so this runs on Python 2 and 3 alike.
    for doc_id, text in documents.items():
        doc_index = inverted_index(text)
        inverted_index_add(inverted, doc_id, doc_index)

    result_docs = search(inverted, query)
    for _, word in word_index(query):
        # search() ignores query words that are not in the index, so such a
        # word can reach this loop while result_docs is non-empty; looking it
        # up in ``inverted`` used to raise KeyError.
        if word not in inverted:
            continue
        for doc in result_docs:
            for index in inverted[word][doc]:
                ans.append(extract_text(doc, index))

    # Debug trace on stdout, kept as-is; the single-argument print(...) form
    # produces the same output on Python 2 and Python 3.
    print('Jayanth')
    for l in ans:
        print(l)
    return render_template('lol.html', **locals())
def word_split(text):
    """Split *text* into maximal runs of alphanumeric characters.

    Args:
        text: The string to tokenise.

    Returns:
        A list of ``(start, word)`` tuples in order of appearance, where
        ``start`` is the index in *text* of the word's first character.
        Characters for which ``isalnum()`` is false act as separators and
        are never part of a word. An empty *text* yields an empty list.
    """
    word_list = []
    wcurrent = []   # characters of the word currently being read
    wstart = None   # index in text where that word began
    for i, c in enumerate(text):
        if c.isalnum():
            if not wcurrent:
                wstart = i
            wcurrent.append(c)
        elif wcurrent:
            # A separator closes the word collected so far.
            word_list.append((wstart, u''.join(wcurrent)))
            wcurrent = []
    # Flush a word that runs up to the very end of the text.
    if wcurrent:
        word_list.append((wstart, u''.join(wcurrent)))
    return word_list
def words_cleanup(words):
    """Filter out words that are too short or are stop words.

    Args:
        words: Iterable of ``(index, word)`` pairs.

    Returns:
        A new list of ``(index, word)`` tuples keeping only the words that
        are at least ``_WORD_MIN_LENGTH`` characters long and not members of
        ``_STOP_WORDS``. Order is preserved.
    """
    return [
        (position, token)
        for position, token in words
        if len(token) >= _WORD_MIN_LENGTH and token not in _STOP_WORDS
    ]
def words_normalize(words):
    """Lower-case every word while keeping its position.

    Args:
        words: Iterable of ``(index, word)`` pairs.

    Returns:
        A new list of ``(index, word.lower())`` tuples in the same order.
    """
    return [(position, token.lower()) for position, token in words]
def word_index(text):
    """Return the indexable ``(index, word)`` pairs found in *text*.

    The text is run through ``word_split``, then ``words_normalize``, then
    ``words_cleanup``, in that order.
    """
    return words_cleanup(words_normalize(word_split(text)))
def inverted_index(text):
    """Build a word -> positions mapping for a single text.

    Returns:
        A dict mapping each word produced by ``word_index(text)`` to the
        list of indices at which it was reported, in order of appearance.
    """
    postings = {}
    for position, token in word_index(text):
        if token not in postings:
            postings[token] = []
        postings[token].append(position)
    return postings
def inverted_index_add(inverted, doc_id, doc_index):
    """Merge one document's index into the multi-document index, in place.

    Args:
        inverted: Dict of the form ``{word: {doc_id: [positions]}}``; it is
            mutated.
        doc_id: Key under which this document's positions are stored.
        doc_index: Dict ``{word: [positions]}`` for the document.

    Returns:
        The same ``inverted`` dict, for convenience. Any positions already
        stored for ``doc_id`` under a word are replaced, and the position
        lists are stored by reference rather than copied.
    """
    # .items() instead of the Python-2-only .iteritems(): works on 2 and 3.
    for word, locations in doc_index.items():
        inverted.setdefault(word, {})[doc_id] = locations
    return inverted
def search(inverted, query):
    """Return the ids of the documents containing every indexed query word.

    Args:
        inverted: Dict of the form ``{word: {doc_id: [positions]}}``.
        query: Free-text query; it is tokenised with ``word_index``.

    Returns:
        A set of document ids that contain all query words present in
        ``inverted``. Query words absent from the index are ignored rather
        than making the search fail. When no query word is in the index, an
        empty list is returned (kept for backward compatibility).
    """
    known_words = [word for _, word in word_index(query) if word in inverted]
    if not known_words:
        return []
    doc_sets = [set(inverted[word]) for word in known_words]
    # set.intersection replaces the bare ``reduce`` builtin, which no longer
    # exists on Python 3; the result is the same intersection of all sets.
    return set.intersection(*doc_sets)
if __name__ == '__main__':
    # Started as a script: serve the app on all interfaces, port 12345.
    app.run(port=12345, host='0.0.0.0')