In [1]:

import os
import sys

# the project root is the parent of the notebook's working directory;
# put it on sys.path (once) so the `src` package can be imported
ROOT = os.path.dirname(os.getcwd())
if ROOT not in sys.path:
    sys.path.insert(0, ROOT)

In [2]:
# import libraries
from src.modeling import BM25
from src.preprocessing import Preprocessing
from rank_bm25 import BM25Okapi
import pandas as pd

In [3]:
# load the prepared books DataFrame from its pickle file (not a CSV)
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source
books_path = os.path.join(ROOT, 'data', 'prepared_books.pkl')
books = pd.read_pickle(books_path)

# column overview, then a two-row preview
books.info()
books.head(2)

<class 'pandas.core.frame.DataFrame'>
Index: 79436 entries, 0 to 271369
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   isbn                   79436 non-null  object
 1   title                  79436 non-null  object
 2   author                 79436 non-null  object
 3   year_of_publication    79436 non-null  object
 4   publisher              79436 non-null  object
 5   description            79436 non-null  object
 6   title_processed        79436 non-null  object
 7   description_processed  79436 non-null  object
dtypes: object(8)
memory usage: 5.5+ MB


Unnamed: 0,isbn,title,author,year_of_publication,publisher,description,title_processed,description_processed
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,"Featuring the authors' extensive, clear, and f...","[classical, mythology]","[featuring, the, authors, extensive, clear, an..."
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,"It is the year 1934, and in a small town in Ca...","[clara, callan]","[it, is, the, year, 1934, and, in, a, small, t..."


In [4]:
# build one BM25Okapi index per text field from the already-tokenized columns
title_tokens = books['title_processed'].tolist()
description_tokens = books['description_processed'].tolist()

bm25_title = BM25Okapi(title_tokens)
bm25_description = BM25Okapi(description_tokens)

In [5]:
# tokenize a free-text query with the project's preprocessing pipeline
query = 'harry potter prisoner azkaba'
tokenized_query = Preprocessing().fit_transform(query)
print(tokenized_query)

# per-field weights: title matches count four times as much as description matches
weight_title, weight_description = 0.8, 0.2

# one BM25 score per book, computed separately for each field
scores_title = bm25_title.get_scores(tokenized_query)
scores_description = bm25_description.get_scores(tokenized_query)

# weighted sum of both fields, stored as a new column on `books`
books['bm25_score'] = weight_title * scores_title + weight_description * scores_description

# best matches first; show the ten highest-scoring books
df_sorted = books.sort_values('bm25_score', ascending=False)
df_sorted[['title', 'description', 'bm25_score']].head(10)

['harry', 'potter', 'prisoner', 'azkaba']


Unnamed: 0,title,description,bm25_score
20166,Harry Potter and the Prisoner of Azkaban Color...,"Three times the scares, three times the tears,...",19.182695
6330,Harry Potter and the Prisoner of Azkaban (Book 3),"For twelve long years, the dread fortress of A...",17.136954
3839,Harry Potter and the Prisoner of Azkaban (Book 3),"For twelve long years, the dread fortress of A...",16.714728
52580,We Love Harry Potter!,Harry Potter-by now the name is surely as magi...,16.326613
34082,Harry Potter and the Prisoner of Azkaban (Book...,"Running time: 11 hrs., 48 mins.\n\nFor twelve ...",16.246581
77384,Harry Potter and the Sorcerer's Stone (Harry P...,Harry Potter has no idea how famous he is. Tha...,16.122784
2143,Harry Potter and the Sorcerer's Stone (Harry P...,Harry Potter has no idea how famous he is. Tha...,16.122784
28425,Harry Potter and the Prisoner of Azkaban (Book...,"Read by Jim Dale\nRunning time: 11 hrs., 48 mi...",15.449513
34540,Harry Potter Und Der Feuerkelch,The fourth book in the Harry Potter series tra...,15.396376
41882,Harry Potter Collector's Value Guide,Lists prices and values of Harry Potter mercha...,15.165872


In [6]:
# try out the project's BM25 class (per the original note it wraps BM25Okapi
# and combines the title and description fields with the given weights)
corpus_title = books['title_processed'].tolist()
corpus_description = books['description_processed'].tolist()

bm25 = BM25(corpus_title=corpus_title, corpus_description=corpus_description, weights=[0.8, 0.2])

In [7]:
# scoring run #1: query that is an exact book title
query_tokens = Preprocessing().fit_transform('we love harry potter')

# second positional argument is True; presumably it enables score normalization
# (recorded scores stay within [0, 1]) - confirm against src.modeling.BM25
books['bm25_score_2'] = bm25.similarity(query_tokens, True)

# rank by the new score and preview the top five
books_scores = books.sort_values('bm25_score_2', ascending=False)
books_scores.head(5)

Unnamed: 0,isbn,title,author,year_of_publication,publisher,description,title_processed,description_processed,bm25_score,bm25_score_2
52580,031226481X,We Love Harry Potter!,Sharon Moore,1999,St. Martin's Press,Harry Potter-by now the name is surely as magi...,"[we, love, harry, potter]","[harry, potterby, now, the, name, is, surely, ...",16.326613,0.998417
2143,059035342X,Harry Potter and the Sorcerer's Stone (Harry P...,J. K. Rowling,1999,Arthur A. Levine Books,Harry Potter has no idea how famous he is. Tha...,"[harry, potter, and, the, sorcerers, stone, ha...","[harry, potter, has, no, idea, how, famous, he...",16.122784,0.610303
77384,059035342x,Harry Potter and the Sorcerer's Stone (Harry P...,J. K. Rowling,1999,Arthur A. Levine Books,Harry Potter has no idea how famous he is. Tha...,"[harry, potter, and, the, sorcerers, stone, ha...","[harry, potter, has, no, idea, how, famous, he...",16.122784,0.610303
34540,3551551936,Harry Potter Und Der Feuerkelch,Joanne K. Rowling,1999,Carlsen Verlag GmbH,The fourth book in the Harry Potter series tra...,"[harry, potter, und, der, feuerkelch]","[the, fourth, book, in, the, harry, potter, se...",15.396376,0.58968
41882,1585980730,Harry Potter Collector's Value Guide,CheckerBee Publishing,2000,CheckerBee Publishing,Lists prices and values of Harry Potter mercha...,"[harry, potter, collectors, value, guide]","[lists, prices, and, values, of, harry, potter...",15.165872,0.579666


In [8]:
# scoring run #2: a longer multi-word title query
query_tokens = Preprocessing().fit_transform('harry potter collectors value guide')

# overwrites the 'bm25_score_2' column written by the previous run
# (the True flag presumably requests normalized scores - confirm in src.modeling.BM25)
books['bm25_score_2'] = bm25.similarity(query_tokens, True)

# rank by the new score and preview the top five
books_scores = books.sort_values('bm25_score_2', ascending=False)
books_scores.head(5)

Unnamed: 0,isbn,title,author,year_of_publication,publisher,description,title_processed,description_processed,bm25_score,bm25_score_2
41882,1585980730,Harry Potter Collector's Value Guide,CheckerBee Publishing,2000,CheckerBee Publishing,Lists prices and values of Harry Potter mercha...,"[harry, potter, collectors, value, guide]","[lists, prices, and, values, of, harry, potter...",15.165872,0.954703
47847,1888914777,Charming Tails 2000 Collector's Value Guide,CheckerBee Publishing,2000,CheckerBee Publishing,Fans of the Charming Tails line can easily kee...,"[charming, tails, 2000, collectors, value, guide]","[fans, of, the, charming, tails, line, can, ea...",0.0,0.624597
76486,188891419X,Beanie Babies Spring 1998 Collector's Value Guide,Collectors Publishing Co,1998,The College of Estate Management,Tys Beanie Babies have captured the hearts of ...,"[beanie, babies, spring, 1998, collectors, val...","[tys, beanie, babies, have, captured, the, hea...",0.0,0.558359
52580,031226481X,We Love Harry Potter!,Sharon Moore,1999,St. Martin's Press,Harry Potter-by now the name is surely as magi...,"[we, love, harry, potter]","[harry, potterby, now, the, name, is, surely, ...",16.326613,0.522343
2143,059035342X,Harry Potter and the Sorcerer's Stone (Harry P...,J. K. Rowling,1999,Arthur A. Levine Books,Harry Potter has no idea how famous he is. Tha...,"[harry, potter, and, the, sorcerers, stone, ha...","[harry, potter, has, no, idea, how, famous, he...",16.122784,0.50531


In [9]:
# scoring run #3: single-token query
query_tokens = Preprocessing().fit_transform('tet')

# overwrites the 'bm25_score_2' column written by the previous run
# (the True flag presumably requests normalized scores - confirm in src.modeling.BM25)
books['bm25_score_2'] = bm25.similarity(query_tokens, True)

# rank by the new score and preview the top five
books_scores = books.sort_values('bm25_score_2', ascending=False)
books_scores.head(5)

Unnamed: 0,isbn,title,author,year_of_publication,publisher,description,title_processed,description_processed,bm25_score,bm25_score_2
28599,002930380X,AFTER TET : THE BLOODIEST YEAR IN VIETNAM,Ronald H. Spector,1992,Free Press,Recounts the experience of American soldiers i...,"[after, tet, the, bloodiest, year, in, vietnam]","[recounts, the, experience, of, american, sold...",0.0,0.8
24687,067156787X,A TIME OF WAR : A TIME OF WAR,Michael Peterson,1996,Pocket,"In the months before the Tet Offensive, season...","[a, time, of, war, a, time, of, war]","[in, the, months, before, the, tet, offensive,...",0.0,0.2
70368,0679760237,In Pharaoh's Army : Memories of the Lost War,TOBIAS WOLFF,1995,Vintage,Whether he is evoking the blind carnage of the...,"[in, pharaohs, army, memories, of, the, lost, ...","[whether, he, is, evoking, the, blind, carnage...",0.0,0.176574
38660,0142003395,The Silent Men,Richard H. Dickinson,2003,Penguin Books,Jackson Monroe is the finest American sniper i...,"[the, silent, men]","[jackson, monroe, is, the, finest, american, s...",0.0,0.123721
23283,0312876610,Vivienne,Richard Hoyt,2000,Forge,"It is 1968, the end of Tet, the Chinese New Ye...",[vivienne],"[it, is, 1968, the, end, of, tet, the, chinese...",0.0,0.098514


In [10]:
# scoring run #4: single-token query with very few matches in the corpus
query_tokens = Preprocessing().fit_transform('gg')

# overwrites the 'bm25_score_2' column written by the previous run
# (the True flag presumably requests normalized scores - confirm in src.modeling.BM25)
books['bm25_score_2'] = bm25.similarity(query_tokens, True)

# rank by the new score and preview the top ten
books_scores = books.sort_values('bm25_score_2', ascending=False)
books_scores.head(10)

Unnamed: 0,isbn,title,author,year_of_publication,publisher,description,title_processed,description_processed,bm25_score,bm25_score_2
12547,671628410,Why Me?,Ellen Conford,1987,Pocket Books,Talk about girl trouble!\nSince G.G. Graffman ...,"[why, me]","[talk, about, girl, trouble, since, gg, graffm...",0.0,0.2
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,"Featuring the authors' extensive, clear, and f...","[classical, mythology]","[featuring, the, authors, extensive, clear, an...",0.0,0.0
52992,758204531,Standing in the Shadows,Shannon McKenna,2003,BRAVA,Ex-FBI agent Connor McCloud must gain the trus...,"[standing, in, the, shadows]","[exfbi, agent, connor, mccloud, must, gain, th...",0.0,0.0
52991,1843605546,A Mutual Favor,Ann Jacobs,2002,Ellora's Cave,"Two good friends, two thorny problems. Kurt ne...","[a, mutual, favor]","[two, good, friends, two, thorny, problems, ku...",0.0,0.0
52990,312987293,You've Got a Hold On Me,Tamara Sneed,2004,St. Martin's Paperbacks,It takes alot for Assistant D.A. Amelia Farrow...,"[youve, got, a, hold, on, me]","[it, takes, alot, for, assistant, da, amelia, ...",0.0,0.0
52989,312989970,The Midnight Hour (A Madaris Family Novel),Brenda Jackson,2004,St. Martin's Paperbacks,"Sexy, dangerous, unforgettable. . .the Madaris...","[the, midnight, hour, a, madaris, family, novel]","[sexy, dangerous, unforgettable, the, madaris,...",0.0,0.0
52988,743457447,Hard Lovin' Man,Lorraine Heath,2003,Pocket Star,Ten years after leaving her former lover Jack ...,"[hard, lovin, man]","[ten, years, after, leaving, her, former, love...",0.0,0.0
52987,441081223,Gnome Man's Land,Esther Friesner,1991,Ace Books,When a hole in the fabric of the universe rips...,"[gnome, mans, land]","[when, a, hole, in, the, fabric, of, the, univ...",0.0,0.0
52986,385510500,Faithful : A Novel,DAVITT SIGERSON,2004,Nan A. Talese,"Risky, fiercely erotic and deeply touching, Da...","[faithful, a, novel]","[risky, fiercely, erotic, and, deeply, touchin...",0.0,0.0
52985,451408136,Diamond Rain,Constance Laux,1999,Topaz,The glitter and pomp of Queen Victoria's Diamo...,"[diamond, rain]","[the, glitter, and, pomp, of, queen, victorias...",0.0,0.0
