-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
executable file
·151 lines (111 loc) · 4.63 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import json
import sys
from bs4 import BeautifulSoup
import re
import pymongo
import math
from pymongo import MongoClient
# Inverted index filled in by updateMongo():
#   word -> {document key (as str): occurrences of the word in that document}
index = {}
# Number of documents processed so far by updateMongo(). Not just a debug
# counter: pushtoMongo() uses it as the numerator of the idf ratio.
count = 0
def readingJson():
    """Load the bookkeeping file and return its parsed JSON content.

    The file is read from the fixed relative path
    ``../WEBPAGES_RAW/bookkeeping.json`` (relative to the current working
    directory).

    Returns:
        The decoded JSON document. ``updateMongo`` iterates over its keys
        and passes each one to ``tokenizefile`` as a page path.

    Raises:
        IOError / OSError: if the file cannot be opened or read.
        ValueError: if the content is not valid JSON.
    """
    # The context manager closes the handle even if reading or decoding
    # raises; the previous explicit close() was skipped in that case.
    with open('../WEBPAGES_RAW/bookkeeping.json') as json_file:
        return json.loads(json_file.read())
def tokenizefile(filepath):
    """Parse one stored page and return its word-frequency dictionary.

    Args:
        filepath: location of the page relative to ``../WEBPAGES_RAW/``.

    Returns:
        The dict produced by ``readwords`` for the page text that remains
        after every ``<style>`` and ``<script>`` element has been removed.

    Raises:
        IOError / OSError: if the page file cannot be opened.
    """
    # BeautifulSoup reads the whole file while building the tree, so the
    # handle can be released right after construction. Previously it was
    # never closed, leaking one descriptor per indexed page.
    with open('../WEBPAGES_RAW/' + filepath) as page:
        soup = BeautifulSoup(page, 'html.parser')
    # Drop CSS and JavaScript so their source text is not indexed as words.
    for tag in soup(['style', 'script']):
        tag.extract()
    return readwords(soup.get_text())
def readwords(words):
    """Count how often each token occurs in a piece of text.

    Every character other than an ASCII letter, a digit, a space, a newline
    or a carriage return is replaced by a space; the result is split on
    whitespace and each token is lower-cased before counting.

    Args:
        words: the text to tokenize.

    Returns:
        A plain dict mapping each lower-cased token to its number of
        occurrences. Empty (or symbol-only) input yields an empty dict.
    """
    tokens = re.sub('[^a-zA-Z0-9 \n\r]', ' ', words).split()
    dictionary = {}
    for word in tokens:
        word = word.lower()
        # dict.has_key() no longer exists on Python 3; get() with a default
        # handles both the first and the repeated occurrence on 2 and 3.
        dictionary[word] = dictionary.get(word, 0) + 1
    return dictionary
def updateMongo():
    """Build the in-memory inverted index for every page in the bookkeeping file.

    Despite its name, this function does not talk to MongoDB; it only fills
    the module-level ``index``. ``pushtoMongo`` is what writes to the
    database.

    Side effects:
        * ``count`` is incremented once per bookkeeping entry.
        * ``index`` gains, for every word of every page, an entry
          ``index[word][str(key)] = occurrences of word in that page``.

    Returns:
        The module-level ``index`` dict itself (not a copy).
    """
    global index
    global count
    json_data = readingJson()  # each key is a page path under WEBPAGES_RAW
    for key in json_data:
        count += 1  # document counter, later used for the idf computation
        doc_id = str(key)
        for word, frequency in tokenizefile(key).items():
            # dict.has_key() no longer exists on Python 3; setdefault()
            # creates the per-word posting dict on first sight and reuses it
            # afterwards, on both Python 2 and 3.
            index.setdefault(word, {})[doc_id] = frequency
    return index
def pushtoMongo(file):
    """Store one tf-idf posting per (term, document) pair in MongoDB.

    Connects to ``mongodb://localhost:27017`` and writes to the ``posts``
    collection of the ``project3v2`` database.

    For every term:
        idf = ln(count / number of documents containing the term)
    where ``count`` is the module-level document counter. For every document
    containing the term:
        tf     = 1 + ln(raw occurrences of the term in the document)
        rating = idf * tf

    Args:
        file: inverted index shaped ``{term: {docID: raw occurrences}}``,
            as returned by ``updateMongo``.

    Raises:
        ZeroDivisionError: if a term maps to an empty posting dict.
        ValueError: if ``count`` is 0 (logarithm of zero).
    """
    client = MongoClient('mongodb://localhost:27017')
    try:
        posts = client.project3v2.posts
        for term in file:
            postings = file[term]
            # Natural logarithm; float() keeps the ratio from being
            # truncated by integer division on Python 2.
            idf = math.log(float(count) / len(postings))
            for docID in postings:
                tf = 1 + float(math.log(postings[docID]))
                rating = float(idf * tf)
                # NOTE(review): the term is stored as the field *name* with
                # the constant value "term"; kept as-is because readers of
                # this collection may rely on that layout - confirm.
                # NOTE(review): Collection.insert() is deprecated since
                # PyMongo 3.0 and removed in 4.0; switch to insert_one()
                # once the installed PyMongo version is confirmed.
                posts.insert({
                    term: "term",
                    'docID': docID,
                    'tf': tf,
                    'idf': idf,
                    'rating': rating,  # rating is the tf-idf weight
                })
    finally:
        # Release the connection even when an insert fails; the client
        # used to be left open.
        client.close()
# Script body: build the inverted index in memory, report its size, then
# store the tf-idf postings in the local MongoDB instance.
#
# Every print passes exactly one pre-formatted string, so the output is the
# same on Python 2 and Python 3. The former print statements were a syntax
# error on Python 3, and the parenthesised two-argument form printed a tuple
# repr on Python 2.
print("Starting program to parse files and create database...")
post = updateMongo()
print("")
print("Documents Read: %d" % count)
print("")
# updateMongo() returns the module-level index itself, so one size report
# covers both `index` and `post`.
print("Unique Words: %d" % len(post))
print("")
print("Pushing to the dbs now!!")
pushtoMongo(post)
print("Done pushing!! Data will all be in local dbs")