In [None]:
import pandas as pd
import pickle
from scipy.sparse import save_npz, load_npz
import numpy as np
import sqlite3
import gc

### Vectorize Wikipedia articles and save their corresponding IDs

In [1]:
# Load the raw articles and the pickled TF-IDF vectorizer.
df = pd.read_csv("datasets_lisandro/wikipedia.csv")

# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load vectorizers that come from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open("models_lisandro/2gram_tfidf.pkl", "rb") as model_file:
    tfidf_vectorizer = pickle.load(model_file)

# Vectorize every article and persist the resulting sparse matrix.
docs_sparse = tfidf_vectorizer.transform(df["text"])
save_npz("datasets_lisandro/docs_sparse.npz", docs_sparse)

# Persist the IDs in the same row order as the matrix so that a row index
# of docs_sparse can later be mapped back to its document ID.
ids = df["id"].values
np.save("datasets_lisandro/wikipedia_ids.npy", ids)

### Get the 5 most similar documents to a query

In [2]:
# Reload the artifacts needed for querying: the document matrix, the
# vectorizer used to embed queries, and the row-index -> document-ID array.
docs_sparse = load_npz('datasets_lisandro/docs_sparse.npz')

# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load vectorizers that come from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open("models_lisandro/2gram_tfidf.pkl", "rb") as model_file:
    tfidf_vectorizer = pickle.load(model_file)

# allow_pickle=True lets NumPy unpickle object-dtype arrays, which carries the
# same trust requirement as pickle.load above.
ids = np.load("datasets_lisandro/wikipedia_ids.npy", allow_pickle=True)

db_path = "data/wikipedia/docs.db"
# check_same_thread=False disables sqlite3's same-thread guard, so the
# connection may be used from a thread other than the one that created it.
connection = sqlite3.connect(db_path, check_same_thread=False)
cursor = connection.cursor()

def get_5_most_similar_documents(query, k=5):
    """Return the documents whose TF-IDF vectors score highest against ``query``.

    The query is vectorized with the module-level ``tfidf_vectorizer`` and
    scored against every row of the module-level ``docs_sparse`` matrix via a
    sparse matrix product. The best-scoring rows are mapped to document IDs
    through the module-level ``ids`` array, and the matching rows are fetched
    from the ``documents`` table using the module-level ``cursor``.

    Parameters
    ----------
    query : str
        Free-text query.
    k : int, default 5
        Maximum number of documents to return. When the corpus holds fewer
        than ``k`` documents, all of them are returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` and ``text``, ordered from most to least similar.
        Empty (but with both columns) when ``k <= 0`` or the corpus is empty.
    """
    query_sparse = tfidf_vectorizer.transform([query])

    # (n_docs, n_features) @ (n_features, 1) -> one score per document.
    # `@` is a matrix product for both SciPy sparse matrices and sparse
    # arrays, whereas `*` is element-wise for sparse arrays.
    scores = (docs_sparse @ query_sparse.transpose()).toarray().ravel()

    k = min(k, scores.size)
    if k <= 0:
        return pd.DataFrame({"id": [], "text": []})

    if k < scores.size:
        # Partial selection: the first k entries are the k best scores, in
        # unspecified order. kth must be < size, hence the separate branch.
        best_indices = np.argpartition(-scores, k - 1)[:k]
    else:
        best_indices = np.arange(scores.size)
    # Order the selected candidates from best to worst score.
    best_indices = best_indices[np.argsort(-scores[best_indices], kind="stable")]

    # Rank of each distinct ID (first occurrence wins if IDs repeat).
    rank = {}
    for position, doc_id in enumerate(ids[best_indices].tolist()):
        rank.setdefault(doc_id, position)
    selected_ids = list(rank)

    # Bind the IDs as parameters instead of splicing their repr into the SQL:
    # this is valid for a single ID, safe for IDs containing quotes, and not
    # open to SQL injection.
    placeholders = ",".join("?" * len(selected_ids))
    cursor.execute(
        "SELECT id, text FROM documents WHERE id IN (" + placeholders + ")",
        selected_ids,
    )

    # `IN` gives no ordering guarantee, so restore the similarity ranking.
    rows = sorted(cursor.fetchall(), key=lambda row: rank.get(row[0], len(rank)))

    return pd.DataFrame(rows, columns=["id", "text"])

query = "How long do Hamsters live?"
try:
    display(get_5_most_similar_documents(query).head(5))
finally:
    # Release the SQLite handles even when the lookup raises, so a failed
    # run does not leave the database connection open in the kernel.
    cursor.close()
    connection.close()

Unnamed: 0,id,text
0,Animal testing on Syrian hamsters,Animal testing on Syrian hamsters\n\nSyrian ha...
1,Domestication of the Syrian hamster,Domestication of the Syrian hamster\n\nThe dom...
2,Golden hamster,"Golden hamster\n\nThe golden hamster, or Syria..."
3,Hamster,Hamster\n\nHamsters are rodents belonging to t...
4,The Hamsters (album),The Hamsters (album)\n\nThe Hamsters (1993) (k...
