In [17]:
from __future__ import annotations
import pymongo
import numpy as np
import scipy as sp
import scipy.sparse as sparse
from scipy.sparse.linalg import svds
import pandas as pd
from analysis import Searcher

In [2]:
def get_db(dbname: str = "indexyz", host: str = "127.0.0.1"):
    """Create a MongoDB client and return a handle to one database.

    Args:
        dbname: Name of the database to open. It is embedded in the
            connection URI and also used to select the database on the
            client. Defaults to ``"indexyz"``.
        host: Host part of the ``mongodb://`` URI. Defaults to
            ``"127.0.0.1"``.

    Returns:
        The database object obtained from the client for ``dbname``.
    """
    client = pymongo.MongoClient(f"mongodb://{host}/{dbname}")
    # Select the database by name instead of the hard-coded attribute
    # `client.indexyz`, so the URI and the returned database always agree.
    return client[dbname]

In [3]:
# Database handle returned by get_db(); used by the cells below.
db = get_db()

In [4]:
# The 'webpages' collection of the database; queried below for pages with terms.
webpages = db.webpages

In [5]:
# Filter: keep documents whose 'terms' field exists and is not an empty array.
condition = {'terms': {'$exists': True, '$not': {'$size': 0}}}

# Fields to return: drop Mongo's '_id', keep the four fields used later on.
projection = dict(_id=0, url=1, terms=1, title=1, description=1)

# NOTE(review): the hint names the index 'url_1' — presumably the
# default-named index on 'url'; confirm it exists on this collection.
cursor1 = webpages.find(filter=condition, projection=projection, hint='url_1')

In [6]:
# Drain the cursor so every matching document is held in memory as a list.
all_webpages = list(cursor1)

In [7]:
# Number of webpage documents fetched from the cursor.
n_webpages = len(all_webpages)

In [8]:
# Show the document count as the cell output.
n_webpages

4011

In [9]:
from tqdm import tqdm

In [10]:
# Build the pieces of a CSR matrix in one pass over the pages:
#   vocabulary   - term -> column index, assigned in order of first appearance
#   term_indexes - column index of every term occurrence, page after page
#   indptr       - offsets into term_indexes marking where each page's run ends
vocabulary = {}
term_indexes = []
indptr = [0]
for webpage in tqdm(all_webpages):
    # setdefault returns the existing column for a known term; otherwise it
    # registers the term under the next free column (the current vocabulary size).
    term_indexes.extend(
        vocabulary.setdefault(term, len(vocabulary))
        for term in webpage['terms']
    )
    indptr.append(len(term_indexes))

100%|██████████| 4011/4011 [00:02<00:00, 1407.94it/s]


In [11]:
# One 1.0 per recorded term occurrence: the stored values of the sparse matrix.
data = np.ones(len(term_indexes))

In [12]:
# Assemble the CSR matrix from (values, column indices, row pointers).
# indptr gained one entry per webpage, so rows are webpages and columns are
# vocabulary terms (despite the variable's name). No shape is passed here.
term_document_matrix = sparse.csr_matrix((data, term_indexes, indptr,))

In [13]:
# Show the matrix dimensions as (number of webpages, number of distinct terms).
term_document_matrix.shape

(4011, 209944)

In [15]:
# Truncated SVD of the sparse matrix, keeping at most 50 singular triplets.
# The previous `k = min(shape) - 1` was dead code, overwritten by `k = 50`.
# Here 50 is the target rank and min(shape) - 1 only acts as an upper bound,
# because svds (default solver) requires k < min(shape); without the clamp a
# small corpus would raise instead of decomposing.
k = min(50, min(term_document_matrix.shape) - 1)
# u, sigma, vh are the factors returned by svds for the k retained components.
u, sigma, vh = svds(term_document_matrix, k=k)

In [16]:
# Tabular view of the fetched webpage records, one row per record.
webs = pd.DataFrame(all_webpages)

In [18]:
# Build the search helper from the SVD factors, the term -> column mapping and
# the webpage table.
# NOTE(review): Searcher comes from the local `analysis` module; what it
# expects of these arguments (e.g. singular-value ordering) is not visible
# here — confirm against its definition.
searcher = Searcher(u, sigma, vh, vocabulary, webs)

In [20]:
# searcher.query('Java')

In [22]:
# svd_filename = 'searcher_svds.npz'
# vocabulary_filename = 'vocabulary.gz'
# web_table_filename = 'web.csv'
# searcher.save(svd_filename, vocabulary_filename, web_table_filename)