In [15]:
import json, re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer

In [16]:
# Query-language tokens mapped to the Python expression fragment that
# replaces them in the rewritten query. NOT becomes "1 -" so that it
# complements a 0/1 incidence row.
LOGIC = {
    "AND": "&",
    "and": "&",
    "OR": "|",
    "or": "|",
    "NOT": "1 -",
    "not": "1 -",
    "(": "(",
    ")": ")",
    "@1@": "1",
    "@True@": "1",
}

# Record attributes of a game that are indexed and searchable by name.
FIELDS = [
    "name",
    "genres",
    "developers",
    "publishers",
    "description_raw",
]

# Shared stemmer used for both indexing and non-exact query terms.
stemmer = PorterStemmer()

In [17]:
def stem_tokenizer(text):
    """Split `text` into lower-cased word tokens and return their stems.

    Tokens are the runs of word characters found in the lower-cased text;
    each one is passed through the module-level `stemmer`.
    """
    stems = []
    for word_match in re.finditer(r"\b\w+\b", text.lower()):
        stems.append(stemmer.stem(word_match.group()))
    return stems

def _phrase_expr(phrase, fields):
    """Return source code for a 0/1 row marking documents that contain `phrase`.

    A document is a hit when the lower-cased text of at least one field in
    `fields` contains `phrase` as a substring. Each field is tested on its
    own, so a phrase can never match across the boundary between two fields.
    Matching is plain substring containment (e.g. "young boy" also matches
    "young boys").
    """
    hits = [
        int(any(
            phrase in str(games[game_id].get(field, "")).lower()
            for field in fields
        ))
        for game_id in doc_ids
    ]
    # str(list) renders as "[0, 1, ...]", so the generated literal is a 2-D
    # array with a single row, one column per entry of doc_ids.
    return f"np.array([{hits}])"


def _term_rows(term, index_type, fields):
    """Return one row-selection expression per field whose index contains `term`."""
    return [
        f"td_matrices['{index_type}']['{field}']"
        f"[vocabularies['{index_type}']['{field}']['{term}']].todense()"
        for field in fields
        if term in vocabularies[index_type][field]
    ]


def rewrite_token(token):
    """Translate one query token into a fragment of Python source code.

    The fragments are joined by the caller and evaluated, so every return
    value is a string:

    * logic connectors / parentheses -> their entry in LOGIC;
    * ``field:term`` -> the term's row in that field's index, or "0" when
      the field is unknown or the term is not indexed;
    * ``term`` (no field) -> the rows of every field index containing the
      term, OR-ed together inside parentheses, or "0" when there are none;
    * a quoted term is looked up in the "exact" (unstemmed) index, an
      unquoted one is stemmed and looked up in the "stem" index;
    * a quoted multi-word phrase -> a literal 0/1 array built by scanning
      the raw text of the selected field (or of each field in FIELDS).

    Only terms already present in a vocabulary are interpolated into the
    generated source; arbitrary user text is never embedded.
    """
    # boolean search with logic connectors
    if token in LOGIC:
        return LOGIC[token]

    # A field prefix is only recognised outside quotes: a token that starts
    # with a quote is a plain phrase even if it contains ":" (for example
    # '"zelda: ocarina of time"').
    if ":" in token and not token.startswith('"'):
        field, raw_term = token.split(":", 1)
        if field not in FIELDS:
            return "0"
        fields = [field]
    else:
        field, raw_term = None, token
        fields = FIELDS

    is_exact = raw_term.startswith('"') and raw_term.endswith('"')
    term = raw_term.replace('"', '').lower()

    # exact multi-word query: scan the raw text instead of the indexes
    if is_exact and " " in term:
        return _phrase_expr(term, fields)

    # single-word query: choose the index matching the requested exactness
    if is_exact:
        index_type = "exact"
    else:
        term = stemmer.stem(term)
        index_type = "stem"

    rows = _term_rows(term, index_type, fields)
    if not rows:
        return "0"

    # A fielded query yields the bare row; an unfielded one is parenthesised
    # so the OR of its rows binds as a single operand.
    if field is not None:
        return rows[0]
    return "(" + " | ".join(rows) + ")"

In [18]:
def rewrite_query(query):
    """Rewrite a boolean search query into a Python expression string.

    The query is split into tokens, each token is translated by
    `rewrite_token`, and the translated fragments are joined with spaces.

    Tokens are, in order of precedence:

    * a quoted phrase, optionally prefixed by ``field:`` (kept whole, so
      spaces and parentheses inside the quotes are preserved);
    * a single parenthesis, even when it touches a term, so that
      ``(zelda OR mario)`` groups as expected instead of producing the
      unknown terms ``(zelda`` and ``mario)``;
    * any other run of characters free of whitespace and parentheses.
    """
    tokens = re.findall(r'(?:\w+:)?"[^"]*"|[()]|[^\s()]+', query)
    return " ".join(rewrite_token(t) for t in tokens)

def query(q, k=10):
    """Run the boolean query `q` and print the first `k` matching games.

    Parameters
    ----------
    q : str
        Query in the notebook's query language (terms, ``field:term``,
        quoted phrases, AND / OR / NOT, parentheses).
    k : int, default 10
        Maximum number of hits to print.

    Prints one line per hit, ``"<rank>. <name> (ID: <game id>)"``, in
    document order. Prints nothing when there is no hit or when the query
    is empty.
    """
    expression = rewrite_query(q)
    if not expression.strip():
        # An empty query has nothing to evaluate (eval("") is a SyntaxError).
        return

    # NOTE(security): eval() executes the generated expression. It is built
    # by rewrite_query() from the outputs of rewrite_token(); keep it that
    # way and never pass raw user text to eval().
    result = eval(expression)

    # A query made only of unknown terms or constants evaluates to a plain
    # int ("0", or "1 - 0" for a negated unknown term), which has no
    # .nonzero(). Broadcast whatever came back to one row over all documents
    # so scalars mean "no document" / "every document".
    hit_mask = np.broadcast_to(
        np.atleast_2d(np.asarray(result)), (1, len(doc_ids))
    )
    hits = hit_mask.nonzero()[1][:k]

    for rank, i in enumerate(hits, start=1):
        game_id = doc_ids[i]
        print(f"{rank}. {games[game_id]['name']} (ID: {game_id})")

In [19]:
# Read the game database from disk.
with open("db.json", encoding="utf-8") as db_file:
    data = json.load(db_file)

# Mapping of game id -> game record.
games = data["games"]

# Position i of doc_ids and of every field_docs[...] list refers to the
# same game, following the iteration order of `games`.
doc_ids = list(games)

# One list of stringified values per searchable field; a missing field
# becomes the empty string.
field_docs = {
    field: [str(game.get(field, "")) for game in games.values()]
    for field in FIELDS
}

In [20]:
# Per-field indexes in two flavours, keyed first by index type and then by
# field name:
#   "stem"  - tokens are lower-cased and stemmed (used for unquoted terms)
#   "exact" - tokens are lower-cased only (used for quoted single words)
vectorizers = {"stem": {}, "exact": {}}
td_matrices = {"stem": {}, "exact": {}}    # term x document sparse matrices
vocabularies = {"stem": {}, "exact": {}}   # term -> row index in td_matrices

for f in FIELDS:

    # index stemmed
    cv_stem = CountVectorizer(
        tokenizer=stem_tokenizer,
        # The custom tokenizer replaces token_pattern; passing None stops
        # scikit-learn from warning on every fit that the default pattern
        # is unused.
        token_pattern=None,
        binary=True
    )
    Xs = cv_stem.fit_transform(field_docs[f])

    vectorizers["stem"][f] = cv_stem
    # fit_transform returns documents x terms; transpose so that a term's
    # vocabulary index selects its row of per-document 0/1 flags.
    td_matrices["stem"][f] = Xs.T.tocsr()
    vocabularies["stem"][f] = cv_stem.vocabulary_

    # index exact
    cv_exact = CountVectorizer(
        lowercase=True,
        binary=True,
        # keep single-character tokens too (the default pattern drops them)
        token_pattern=r"\b\w+\b"
    )
    Xe = cv_exact.fit_transform(field_docs[f])

    vectorizers["exact"][f] = cv_exact
    td_matrices["exact"][f] = Xe.T.tocsr()
    vocabularies["exact"][f] = cv_exact.vocabulary_

In [25]:
# Example queries: an exact phrase restricted to the name field, and an
# exact phrase searched across every field.
q1 = 'name:"the legend of zelda"'
q2 = '"young boy"'

# Run them in order, printing a divider line between the two result lists.
for position, example in enumerate((q1, q2)):
    if position > 0:
        print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    query(example)

1. The Legend of Zelda: Ocarina of Time (ID: 25097)
2. The Legend of Zelda: Breath of the Wild (ID: 22511)
3. The Legend of Zelda: Tears of the Kingdom (ID: 327239)
4. The Legend of Zelda: Collector's Edition (ID: 364800)
5. The Legend of Zelda: Majora's Mask (ID: 25924)
6. The Legend of Zelda: A Link to the Past (ID: 25096)
7. The Legend of Zelda: Ocarina of Time 3D (ID: 27969)
8. The Legend of Zelda: Skyward Sword (ID: 26824)
9. The Legend of Zelda: A Link Between Worlds (ID: 27977)
10. The Legend of Zelda: The Wind Waker HD (ID: 27975)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1. The Legend of Zelda: Ocarina of Time (ID: 25097)
2. The Legend of Zelda: A Link to the Past (ID: 25096)
3. ICO (ID: 4490)
4. Grandia (ID: 5075)
5. Shin Megami Tensei: Persona 3 (ID: 40002)
6. Ni no Kuni: Wrath of the White Witch (ID: 4128)
7. The Unfinished Swan (ID: 3533)
8. Little Nightmares (ID: 41)
9. Lunar: The Silver Star (ID: 54275)
10. Puppeteer (ID: 3911)
