In [1]:
import sqlite3
import pandas as pd
import psycopg2
import configparser
import numpy as np
import os
import re
import time
import nmslib
from scipy.sparse import csr_matrix # may not be required 
from scipy.sparse import rand # may not be required
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
try:
    sqliteConnection = sqlite3.connect('DEI Database.db')
    cursor = sqliteConnection.cursor()
    print("Connected to SQLite")
    
    sql_query = """SELECT Genre,COALESCE(Author,'') AS Author,Title,Year,COALESCE(Author,'')||' '||Title AS match_point
                FROM title_list
                """
    cursor.execute(sql_query)
    sqliteConnection.commit()
    comp_rows = cursor.fetchall()
    cursor.close()
except sqlite3.Error as error:
    print("Failed to run query", error)   

Connected to SQLite


In [3]:
column_names = ["Genre","Author","Title","Year","MatchPoint"]
comp_df = pd.DataFrame(comp_rows, columns=column_names)
comp_df

Unnamed: 0,Genre,Author,Title,Year,MatchPoint
0,Romance/Erotic Romance,Charlie Adhara,The Wolf at the Door,2018.0,Charlie Adhara The Wolf at the Door
1,Romance/Erotic Romance,Charlie Adhara,The Wolf at Bay,2018.0,Charlie Adhara The Wolf at Bay
2,Romance/Erotic Romance,Charlie Adhara,Thrown to the Wolves,2019.0,Charlie Adhara Thrown to the Wolves
3,Romance/Erotic Romance,Charlie Adhara,Wolf in Sheep’s Clothing,2020.0,Charlie Adhara Wolf in Sheep’s Clothing
4,Romance/Erotic Romance,Brea Alepoú,His Bewildered Mate,2019.0,Brea Alepoú His Bewildered Mate
...,...,...,...,...,...
4011,Fiction Anthologies,,Shades Of Black: Crime And Mystery Stories By ...,2004.0,Shades Of Black: Crime And Mystery Stories By...
4012,Fiction Anthologies,,Slay: Stories of the Vampire Noire,2020.0,Slay: Stories of the Vampire Noire
4013,Fiction Anthologies,,Transcendent 3: The Year’s Best Transgender Sp...,2018.0,Transcendent 3: The Year’s Best Transgender S...
4014,Fiction Anthologies,,Transcendent 4,2019.0,Transcendent 4


In [28]:
config = configparser.ConfigParser()
config.read('Y:\\SQL Reports\\creds\\app_SIC.ini')

try:
    query = """SELECT
b.best_title AS title,
COALESCE(REPLACE(SPLIT_PART(SPLIT_PART(b.best_author,' (',1),', ',2),'.','')||' '||SPLIT_PART(b.best_author,', ',1),'') AS author,
b.publish_year,
STRING_AGG(DISTINCT i.location_code,',') AS location,
COALESCE(REPLACE(SPLIT_PART(SPLIT_PART(b.best_author,' (',1),', ',2),'.','')||' '||SPLIT_PART(b.best_author,', ',1),'')||' '||b.best_title AS match_point

FROM
sierra_view.bib_record_property b
JOIN
sierra_view.bib_record_item_record_link l
ON
b.bib_record_id = l.bib_record_id
JOIN
sierra_view.item_record i
ON
l.item_record_id = i.id
AND SUBSTRING(i.location_code,4,1) NOT IN ('y','j')
AND i.location_code ~ '^lin'
JOIN
sierra_view.phrase_entry d
ON
l.bib_record_id = d.record_id AND d.index_tag = 'd' AND d.index_entry ~ 'fiction'
WHERE
b.material_code = 'a'
AND b.publish_year >= 2018

GROUP BY 1,2,3
ORDER BY 1,2
    """
    #variable connection string should be defined in the imported config file
    conn = psycopg2.connect( config['db']['connection_string'] )
except:
    print("unable to connect to the database")
    clear_connection()
cursor = conn.cursor()
cursor.execute(query)
#For now, just storing the data in a variable. We'll use it later.
local_rows = cursor.fetchall()
conn.close()

In [29]:
column_names_local = ["Title", "Author", "Year", "Location", "MatchPoint"]
local_df = pd.DataFrame(local_rows, columns=column_names_local)
local_df

Unnamed: 0,Title,Author,Year,Location,MatchPoint
0,10 minutes 38 seconds in this strange world,Elif Shafak,2019,lina,Elif Shafak 10 minutes 38 seconds in this stra...
1,2034 : a novel of the next world war,Elliot Ackerman,2021,lina,Elliot Ackerman 2034 : a novel of the next wor...
2,28 summers : a novel,Elin Hilderbrand,2020,lina,Elin Hilderbrand 28 summers : a novel
3,29 seconds,T M Logan,2019,lina,T M Logan 29 seconds
4,48 hours,William R Forstchen,2019,lina,William R Forstchen 48 hours
...,...,...,...,...,...
2715,Your house will pay : a novel,Steph Cha,2019,lina,Steph Cha Your house will pay : a novel
2716,Yours cheerfully : a novel,A J Pearce,2021,linan,A J Pearce Yours cheerfully : a novel
2717,Zed : a novel,Joanna Kavenna,2020,lina,Joanna Kavenna Zed : a novel
2718,Zero sum game,S L Huang,2018,lina,S L Huang Zero sum game


In [31]:
local_df[local_df['MatchPoint'].str.match('Daniel José')]

Unnamed: 0,Title,Author,Year,Location,MatchPoint
1678,The book of lost saints,Daniel José Older,2019,lina,Daniel José Older The book of lost saints


In [32]:
def ngrams(string, n=3):
    """Takes an input string, cleans it and converts to ngrams. 
    This script is focussed on cleaning UK company names but can be made generic by removing lines below"""
    string = str(string)
    string = string.lower() # lower case
    #string = fix_text(string) # fix text
    string = string.encode("ascii", errors="ignore").decode() #remove non ascii chars
    chars_to_remove = [")","(",".","|","[","]","{","}","'","-"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']' #remove punc, brackets etc...
    string = re.sub(rx, '', string)
    string = re.sub(' +',' ',string).strip() # get rid of multiple spaces and replace with a single
    string = ' '+ string +' ' # pad names for ngrams...
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]


###FIRST TIME RUN - takes about 5 minutes... used to build the matching table

##### Create a list of items to match here:
comp_match = list(comp_df["MatchPoint"].unique()) #unique org names from company watch file
#Building the TFIDF off the clean dataset - takes about 5 min
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(comp_match)

##### Create a list of messy items to match here:
local_match = list(local_df["MatchPoint"].unique()) #unique list of names


messy_tf_idf_matrix = vectorizer.transform(local_match)

In [33]:
# create a random matrix to index
data_matrix = tf_idf_matrix#[0:1000000]

# Set index parameters
# These are the most important ones
M = 80
efC = 1000

num_threads = 4 # adjust for the number of threads
# Intitialize the library, specify the space, the type of the vector and add data points 
index = nmslib.init(method='simple_invindx', space='negdotprod_sparse_fast', data_type=nmslib.DataType.SPARSE_VECTOR) 

index.addDataPointBatch(data_matrix)
# Create an index
index.createIndex() 


In [34]:
# Number of neighbors 
num_threads = 4
K=1
query_matrix = messy_tf_idf_matrix

query_qty = query_matrix.shape[0]
nbrs = index.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)

In [35]:
mts =[]
for i in range(len(nbrs)):
  original_nm = local_match[i]
  try:
    matched_nm   = comp_match[nbrs[i][0][0]]
    conf         = nbrs[i][1][0] * -1
  except:
    matched_nm   = "no match found"
    conf         = None
  mts.append([original_nm,matched_nm,conf])

mts = pd.DataFrame(mts,columns=['comp_match','local_match','conf'])

In [36]:
mts = mts.sort_values(by=['conf'])
mts

Unnamed: 0,comp_match,local_match,conf
2527,1972- author Lilja Sigurðardóttir Trap,Julie Tieu The Donut Trap,0.138503
1233,Alastair Reynolds Permafrost,A.C. Arthur The Last Affair,0.146572
1190,María Gainza Optic nerve,Sarah Gailey Taste of Marrow,0.147769
2388,Seth Dickinson The tyrant Baru Cormorant,Ruth Dickey Mud Blooms,0.148670
660,Owen Laukkanen Gale force,Ed Morales Latinx: The New Force in American P...,0.152833
...,...,...,...
2009,Chanel Cleeton The last train to Key West,Chanel Cleeton The Last Train to Key West,1.000000
0,Elif Shafak 10 minutes 38 seconds in this stra...,Elif Shafak 10 Minutes 38 Seconds in This Stra...,1.000000
2628,Sigrid Nunez What are you going through,Sigrid Nunez What Are You Going Through,1.000000
465,James McBride Deacon King Kong,James McBride Deacon King Kong,1.000000


In [37]:
mts[mts['local_match'].str.match('Daniel José')]

Unnamed: 0,comp_match,local_match,conf
1674,Lisa Wingate The book of lost friends : a novel,Daniel José Older The Book of Lost Saints,0.325401
1675,Kristin Harmel The book of lost names : a novel,Daniel José Older The Book of Lost Saints,0.335621
1676,Daniel José Older The book of lost saints,Daniel José Older The Book of Lost Saints,1.0


In [22]:
mts[mts['conf']<= .66]

Unnamed: 0,comp_match,local_match,conf
22351,B Akunin Ubitʹ zmeenysha : pʹesa,Victoria Chang Obit,0.093450
9534,Alekseĭ Zhivoĭ Legion : Pryzhok lʹva ; Ispansk...,Nikki Giovanni Make Me Rain: Poems & Prose,0.095440
2115,Zaynab ʻAfīfī Aḥlum wa-anā bi-jiwārak : riwāyah,Zakiyyah Iman Jackson Becoming Human: Matter a...,0.098011
13706,V V Erofeev Roz︠h︡eva Mysha : Nevelykyĭ mahich...,Ruha Benjamin Race After Technology: Abolition...,0.098256
22354,N Zhivotov Ubiĭt︠s︡a,"Lun Zhang Adrien Gombeaud, and Améziane, Tiana...",0.098477
...,...,...,...
13464,Rebecca Roanhorse Resistance reborn,Rebecca Roanhorse Black Sun,0.652681
687,M J Tjia A necessary murder,M.J. Tija A Necessary Murder,0.654089
4391,Pola Oloixarac Dark constellations,Pola Oloixarac Mona,0.654350
16278,Emma Donoghue The Lotterys plus one,Emma Donoghue Akin,0.656727


In [None]:
mts.to_excel("matched_output.xlsx")

In [38]:
print('estimated percentage of titles in collection '+ str(round((len(mts.loc[mts['conf'] > .66]) / len(comp_df))*100,2)) +'%')

estimated percentage of titles in collection 8.17%


In [39]:
fiction_df = comp_df[comp_df['Genre']=='Fiction']

In [40]:
fiction_df

Unnamed: 0,Genre,Author,Title,Year,MatchPoint
0,Fiction,Charlie Adhara,The Wolf at the Door,2018.0,Charlie Adhara The Wolf at the Door
1,Fiction,Charlie Adhara,The Wolf at Bay,2018.0,Charlie Adhara The Wolf at Bay
2,Fiction,Charlie Adhara,Thrown to the Wolves,2019.0,Charlie Adhara Thrown to the Wolves
3,Fiction,Charlie Adhara,Wolf in Sheep’s Clothing,2020.0,Charlie Adhara Wolf in Sheep’s Clothing
4,Fiction,Brea Alepoú,His Bewildered Mate,2019.0,Brea Alepoú His Bewildered Mate
...,...,...,...,...,...
4012,Fiction,,Slay: Stories of the Vampire Noire,2020.0,Slay: Stories of the Vampire Noire
4013,Fiction,,Transcendent 3: The Year’s Best Transgender Sp...,2018.0,Transcendent 3: The Year’s Best Transgender S...
4014,Fiction,,Transcendent 4,2019.0,Transcendent 4
4015,Fiction,,Walking the Clouds: An Anthology of Indigenous...,2012.0,Walking the Clouds: An Anthology of Indigenou...


In [41]:
print('estimated percentage of titles in fiction collection '+ str(round((len(mts.loc[mts['conf'] > .66]) / len(fiction_df))*100,2)) +'%')

estimated percentage of titles in fiction collection 8.17%


In [44]:
fiction_result = pd.merge(fiction_df,mts[mts['conf']> .66], left_on='MatchPoint',right_on='comp_match', how='left')

In [45]:
fiction_result

Unnamed: 0,Genre,Author,Title,Year,MatchPoint,comp_match,local_match,conf
0,Fiction,Charlie Adhara,The Wolf at the Door,2018.0,Charlie Adhara The Wolf at the Door,,,
1,Fiction,Charlie Adhara,The Wolf at Bay,2018.0,Charlie Adhara The Wolf at Bay,,,
2,Fiction,Charlie Adhara,Thrown to the Wolves,2019.0,Charlie Adhara Thrown to the Wolves,,,
3,Fiction,Charlie Adhara,Wolf in Sheep’s Clothing,2020.0,Charlie Adhara Wolf in Sheep’s Clothing,,,
4,Fiction,Brea Alepoú,His Bewildered Mate,2019.0,Brea Alepoú His Bewildered Mate,,,
...,...,...,...,...,...,...,...,...
4012,Fiction,,Slay: Stories of the Vampire Noire,2020.0,Slay: Stories of the Vampire Noire,,,
4013,Fiction,,Transcendent 3: The Year’s Best Transgender Sp...,2018.0,Transcendent 3: The Year’s Best Transgender S...,,,
4014,Fiction,,Transcendent 4,2019.0,Transcendent 4,,,
4015,Fiction,,Walking the Clouds: An Anthology of Indigenous...,2012.0,Walking the Clouds: An Anthology of Indigenou...,,,


In [23]:
fiction_result.to_excel("fiction_matched_output.xlsx")

In [27]:
mts[mts["comp_match"].str.match('Rebecca Roanhorse')]

Unnamed: 0,comp_match,local_match,conf
13464,Rebecca Roanhorse Resistance reborn,Rebecca Roanhorse Black Sun,0.652681
13170,Rebecca Roanhorse Race to the sun,Rebecca Roanhorse Black Sun,0.815647
15218,Rebecca Roanhorse Storm of locusts,Rebecca Roanhorse Storm of Locusts,1.0
22080,Rebecca Roanhorse Trail of lightning,Rebecca Roanhorse Trail of Lightning,1.0
2752,Rebecca Roanhorse Black sun,Rebecca Roanhorse Black Sun,1.0
