In [1]:
import os
import re
from copy import deepcopy

from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
from whoosh.analysis import Composable, Filter, StopFilter, RegexTokenizer, LowercaseFilter
from whoosh.analysis.acore import Token
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.index import create_in

In [3]:
class LemmaFilter(Filter):
    """Analysis filter that adds the WordNet lemma of each token to the stream.

    Every incoming token is passed through. When the lemmatizer returns a
    string that differs from the token's text, a copy of the token carrying
    the lemma as its text is emitted directly after it; all other attributes
    of the copy are those of the source token at the time of copying.
    Tokens whose lemma equals their text are emitted only once, so unchanged
    words are not duplicated in the stream.
    """

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()

    def __call__(self, tokens):
        """Yield each token, followed by its lemma variant when the two differ.

        :param tokens: iterable of token objects exposing a ``text`` attribute.
        :returns: generator over the original tokens interleaved with the
            extra lemma tokens.
        """
        for token in tokens:
            surface = token.text
            lemma = self.lemmatizer.lemmatize(surface)

            if lemma == surface:
                # Nothing new to add: emitting a second, identical token would
                # only double this word's weight.
                yield token
                continue

            # Snapshot the token *before* handing it on, in case a later
            # filter in the chain modifies the yielded object in place.
            lemma_token = deepcopy(token)
            lemma_token.text = lemma
            yield token
            yield lemma_token

In [4]:
# Analyzer for review bodies: tokenize, lowercase, add lemmas, drop stop words.
# Lowercasing runs BEFORE LemmaFilter so the lemmatizer receives lowercase
# input; capitalised words (e.g. at sentence starts) would otherwise be handed
# to it in their original case and likely come back unchanged.
text_analyzer = RegexTokenizer() | LowercaseFilter() | LemmaFilter() | StopFilter()

# Analyzer for the short metadata fields: tokenize and lowercase only.
general_analyzer = RegexTokenizer() | LowercaseFilter()

In [5]:
# Index layout for one review document.
schema = Schema(
    # Review identifier: ID keeps the whole value as a single exact term
    # (TEXT would run it through a text analyzer). Stored so it can be read
    # back from search results.
    id=ID(stored=True),
    # Metadata fields share the light analyzer; album and artist carry
    # field boosts of 1.5 and 2.0 respectively.
    album=TEXT(field_boost=1.5, analyzer=general_analyzer),
    artist=TEXT(field_boost=2.0, analyzer=general_analyzer),
    # Review body uses the lemmatizing analyzer.
    text=TEXT(analyzer=text_analyzer),
    source=TEXT(analyzer=general_analyzer),
)

In [7]:
# Directory that holds the on-disk index.
INDEX_DIR = "./Index"

# create_in() needs the target directory to exist, so make sure it does
# (no-op when it is already there).
os.makedirs(INDEX_DIR, exist_ok=True)

# NOTE(review): create_in() builds a new index for `schema` in INDEX_DIR;
# re-running this cell is expected to start over from an empty index.
index = create_in(INDEX_DIR, schema)
writer = index.writer()

In [8]:
import sqlite3

In [9]:
# Open the crawler's SQLite database and obtain a cursor for querying it.
REVIEWS_DB = "./reviews_crawler/reviews.db"
connection = sqlite3.connect(REVIEWS_DB)
cursor = connection.cursor()

In [10]:
# Load the id and metadata columns of every review row into memory at once.
REVIEWS_QUERY = "SELECT _id, album, artist, source FROM reviews"
result = cursor.execute(REVIEWS_QUERY).fetchall()

In [11]:
# Add one index document per review row: metadata comes from the database row,
# the body from the text file named after the review id.
for _id, album, artist, source in tqdm(result):
    # Substitute placeholders when the database has no album / artist value.
    if album is None:
        album = 'Unknown'
    if artist is None:
        artist = 'Various Artists'

    with open("./reviews_crawler/review_texts/" + _id) as review_file:
        # read() instead of readline(): index the complete review rather than
        # only its first line.
        text = review_file.read()

    writer.add_document(id=_id, album=album, artist=artist, source=source, text=text)


100%|██████████| 20943/20943 [07:48<00:00, 44.73it/s]


In [12]:
# Commit the writer so the documents added in the indexing loop are saved to the index.
writer.commit()