In [33]:
import os
import json

import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from scipy import spatial

import config

In [17]:
# Fetch the page served at config.LEPUS_URL. requests waits indefinitely by
# default, so an explicit timeout keeps the cell from hanging if the service
# is unreachable.
r = requests.get(config.LEPUS_URL, timeout=10)

In [18]:
r.status_code

200

In [19]:
r.text

'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n<html>\n    <head>\n        <title>Reusable Form Demo</title>\n    </head>\n    <body>\n        \n            \n        \n        <form action="" method="post">\n            \n \n            <div class="input text">\n                <label for="name">Enter your request:</label> <input id="name" name="name" required type="text" value="">\n            </div>\n \n            <div class="input submit">\n                <input type="submit" value="Submit" />\n            </div>\n        </form>\n    </body>\n</html>'

In [20]:
# Toy four-document corpus used to try out TfidfVectorizer.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

In [21]:
corpus

['This is the first document.',
 'This document is the second document.',
 'And this is the third one.',
 'Is this the first document?']

In [22]:
vectorizer = TfidfVectorizer()

In [23]:
X = vectorizer.fit_transform(corpus)

In [24]:
# get_feature_names() was removed in scikit-learn 1.2. Its replacement returns
# an ndarray, so convert to a plain list to keep the printed form unchanged.
print(vectorizer.get_feature_names_out().tolist())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [25]:
# Dense TF-IDF rows as nested lists of plain Python floats (JSON-serialisable).
# toarray() goes straight to an ndarray instead of through numpy.matrix.
tfidf = X.toarray().tolist()

In [28]:
# Project the query onto the vocabulary that `vectorizer` already learned
# (transform, not fit_transform). .todense() returns a numpy.matrix with a
# single row, as the echoed value shows.
query = vectorizer.transform(["document is one"]).todense()
query

matrix([[0.        , 0.49248889, 0.        , 0.40264194, 0.77157901,
         0.        , 0.        , 0.        , 0.        ]])

In [29]:
# Cosine similarity = 1 - cosine distance. scipy's distance functions expect
# 1-D vectors, while `query` is a one-row 2-D matrix, so flatten it explicitly.
result = 1 - spatial.distance.cosine(tfidf[0], np.asarray(query).ravel())
result

0.3860158614579332

In [30]:
# Similarity of every document row to the query in a single call.
# np.asarray() turns the numpy.matrix query into a regular ndarray, which
# newer scikit-learn versions require (they reject numpy.matrix input).
res = cosine_similarity(tfidf, np.asarray(query))
res

array([[0.38601586],
       [0.45182507],
       [0.50247875],
       [0.38601586]])

In [31]:
# tfidf

In [34]:
json.dumps(tfidf)

'[[0.0, 0.46979138557992045, 0.5802858236844359, 0.38408524091481483, 0.0, 0.0, 0.38408524091481483, 0.0, 0.38408524091481483], [0.0, 0.6876235979836938, 0.0, 0.281088674033753, 0.0, 0.5386476208856763, 0.281088674033753, 0.0, 0.281088674033753], [0.511848512707169, 0.0, 0.0, 0.267103787642168, 0.511848512707169, 0.0, 0.267103787642168, 0.511848512707169, 0.267103787642168], [0.0, 0.46979138557992045, 0.5802858236844359, 0.38408524091481483, 0.0, 0.0, 0.38408524091481483, 0.0, 0.38408524091481483]]'

In [35]:
# Module-level index state, empty until load_index() rebinds it from disk.
forward_index = dict()
inverted_index = dict()
documents_id = list()

In [36]:
def load_index(path):
    """Load the three index files stored in ``path`` into module globals.

    Reads ``forward_index.json``, ``inverted_index.json`` and
    ``documents_id.json`` (UTF-8 encoded JSON), in that order, and rebinds
    the globals ``forward_index``, ``inverted_index`` and ``documents_id``
    to the decoded contents.

    Args:
        path: Directory that contains the three JSON files.

    Returns:
        None. The results are published through the module-level globals.
    """
    global forward_index, inverted_index, documents_id

    def _read_json(file_name):
        # Decode one JSON file that lives directly under ``path``.
        with open(os.path.join(path, file_name), 'r', encoding='utf8') as infile:
            return json.load(infile)

    forward_index = _read_json("forward_index.json")
    inverted_index = _read_json("inverted_index.json")
    documents_id = _read_json("documents_id.json")

In [37]:
load_index(config.data_dir)

In [38]:
len(documents_id)

1001

In [39]:
# Rebuild the corpus from the loaded index, following documents_id order so
# that row k of the TF-IDF matrix belongs to documents_id[k].
corpus = [forward_index[doc_id]["text_normalized"] for doc_id in documents_id]

vectorizer = TfidfVectorizer()
# toarray() yields a regular ndarray. todense() would return numpy.matrix,
# which newer scikit-learn versions refuse as input to cosine_similarity.
tfidf = vectorizer.fit_transform(corpus).toarray()

In [40]:
tfidf.shape

(1001, 8652)

In [41]:
query = vectorizer.transform(["водител работ"])

In [45]:
res = cosine_similarity(tfidf, query)
np.squeeze(res)

array([0.        , 0.        , 0.04103756, ..., 0.        , 0.0186467 ,
       0.        ])

In [None]:
def compute_tfidf():
    """Fit a TF-IDF model over the normalized text of every indexed document.

    Reads the module-level ``documents_id`` and ``forward_index`` and builds
    the corpus in ``documents_id`` order, so row ``k`` of the result belongs
    to ``documents_id[k]``.

    Returns:
        tuple: ``(vectorizer, tfidf)``. ``vectorizer`` is the fitted
        ``TfidfVectorizer``, needed to project queries into the same feature
        space. ``tfidf`` is a list of rows, each a list of plain Python
        floats, so it can be passed to ``json.dumps`` directly.

    Raises:
        KeyError: If an id from ``documents_id`` is missing in
            ``forward_index`` or its entry has no ``"text_normalized"`` key.
    """
    corpus = [forward_index[doc_id]["text_normalized"] for doc_id in documents_id]

    vectorizer = TfidfVectorizer()
    # Convert the sparse result of this fit (not a stale global) to nested lists.
    tfidf = vectorizer.fit_transform(corpus).toarray().tolist()
    return vectorizer, tfidf

In [103]:
a = [1, 2, 3]
b = [6, 4, 0]

In [104]:
c = zip(a, b)

In [105]:
c = list(c)

In [106]:
c

[(1, 6), (2, 4), (3, 0)]

In [107]:
 t = sorted(list(c), key=lambda x: x[1])

In [108]:
t

[(3, 0), (2, 4), (1, 6)]

In [109]:
# Descending by the second element. Built from `c` rather than by flipping
# `t` in place, so re-running this cell always gives the same result.
# Slicing with [::-1] keeps the same tie order as reversing the sorted list.
t = sorted(c, key=lambda pair: pair[1])[::-1]

In [110]:
t

[(1, 6), (2, 4), (3, 0)]

In [111]:
c

[(1, 6), (2, 4), (3, 0)]