In [1]:
import sqlite3
import pandas as pd
import psycopg2
import configparser

In [2]:
try:
    sqliteConnection = sqlite3.connect('DEI Database.db')
    cursor = sqliteConnection.cursor()
    print("Connected to SQLite")
    
    sql_query = """SELECT Genre,COALESCE(Author,'') AS Author,Title,Year,COALESCE(Author,'')||' '||Title AS match_point
                FROM title_list
                """
    cursor.execute(sql_query)
    sqliteConnection.commit()
    dei_rows = cursor.fetchall()
    cursor.close()
except sqlite3.Error as error:
    print("Failed to run query", error)   

Connected to SQLite


In [3]:
column_names = ["Genre","Author","Title","Year","MatchPoint"]
dei_df = pd.DataFrame(dei_rows, columns=column_names)
dei_df

Unnamed: 0,Genre,Author,Title,Year,MatchPoint
0,Romance/Erotic Romance,Charlie Adhara,The Wolf at the Door,2018.0,Charlie Adhara The Wolf at the Door
1,Romance/Erotic Romance,Charlie Adhara,The Wolf at Bay,2018.0,Charlie Adhara The Wolf at Bay
2,Romance/Erotic Romance,Charlie Adhara,Thrown to the Wolves,2019.0,Charlie Adhara Thrown to the Wolves
3,Romance/Erotic Romance,Charlie Adhara,Wolf in Sheep’s Clothing,2020.0,Charlie Adhara Wolf in Sheep’s Clothing
4,Romance/Erotic Romance,Brea Alepoú,His Bewildered Mate,2019.0,Brea Alepoú His Bewildered Mate
...,...,...,...,...,...
3954,Fiction Anthologies,,Shades Of Black: Crime And Mystery Stories By ...,2004.0,Shades Of Black: Crime And Mystery Stories By...
3955,Fiction Anthologies,,Slay: Stories of the Vampire Noire,2020.0,Slay: Stories of the Vampire Noire
3956,Fiction Anthologies,,Transcendent 3: The Year’s Best Transgender Sp...,2018.0,Transcendent 3: The Year’s Best Transgender S...
3957,Fiction Anthologies,,Transcendent 4,2019.0,Transcendent 4


In [4]:
config = configparser.ConfigParser()
config.read('Y:\\SQL Reports\\creds\\app_SIC.ini')

try:
    query = """SELECT
b.best_title AS title,
COALESCE(REPLACE(SPLIT_PART(SPLIT_PART(b.best_author,' (',1),', ',2),'.','')||' '||SPLIT_PART(b.best_author,', ',1),'') AS author,
b.publish_year,
STRING_AGG(DISTINCT i.location_code,',') AS location,
COALESCE(REPLACE(SPLIT_PART(SPLIT_PART(b.best_author,' (',1),', ',2),'.','')||' '||SPLIT_PART(b.best_author,', ',1),'')||' '||b.best_title AS match_point

FROM
sierra_view.bib_record_property b
JOIN
sierra_view.bib_record_item_record_link l
ON
b.bib_record_id = l.bib_record_id
JOIN
sierra_view.item_record i
ON
l.item_record_id = i.id
WHERE
b.material_code = 'a'
AND b.publish_year >= 2018
--AND i.location_code ~ '^lin'

GROUP BY 1,2,3
ORDER BY 1,2
    """
    #variable connection string should be defined in the imported config file
    conn = psycopg2.connect( config['db']['connection_string'] )
except:
    print("unable to connect to the database")
    clear_connection()
cursor = conn.cursor()
cursor.execute(query)
#For now, just storing the data in a variable. We'll use it later.
sierra_rows = cursor.fetchall()
conn.close()

In [5]:
column_names_sierra = ["Title", "Author", "Year", "Location", "MatchPoint"]
sierra_df = pd.DataFrame(sierra_rows, columns=column_names_sierra)
sierra_df

Unnamed: 0,Title,Author,Year,Location,MatchPoint
0,Amanda's Dream (English Hebrew Bilingual Book),Shelley Admont,2019,fp2j,Shelley Admont Amanda's Dream (English Hebre...
1,El Nuevo Bebé (the New Baby) (Ana & Andrew S...,Christine Platt,2021,"fp2jm,fpljm",Christine Platt El Nuevo Bebé (the New Baby)...
2,After High School: A Guide To Help You,Brian Harris,2020,ntna,Brian Harris After High School: A Guide To He...
3,Arsène Lupin Versus Herlock Sholmes,Maurice Leblanc,2021,ntna,Maurice Leblanc Arsène Lupin Versus Herlock S...
4,Butterflies and Second Chances: A Mom's Memoi...,Annette Hines,2019,ntnt,Annette Hines Butterflies and Second Chances:...
...,...,...,...,...,...
109765,Ṭayf al-Ḥallāj : riwāyah,Maqbūl Mūsá ʻAlawī,2018,ntnae,Maqbūl Mūsá ʻAlawī Ṭayf al-Ḥallāj : riwāyah
109766,Ṭopī Śuklā,1927-1992 Rāhī Māsūma Razā,2019,acta,1927-1992 Rāhī Māsūma Razā Ṭopī Śuklā
109767,Ẓilāl al-ākharīn : riwāyah,Manāf Zaytūn,2018,ntnae,Manāf Zaytūn Ẓilāl al-ākharīn : riwāyah
109768,偷溜蛋,約翰 (John,2020,wlmj,約翰 (John 偷溜蛋


In [6]:
import numpy as np
import os
import re
import time

#!pip install ftfy #  text cleaning for decode issues..

#from ftfy import fix_text
def ngrams(string, n=3):
    """Takes an input string, cleans it and converts to ngrams. 
    This script is focussed on cleaning UK company names but can be made generic by removing lines below"""
    string = str(string)
    string = string.lower() # lower case
    #string = fix_text(string) # fix text
    string = string.encode("ascii", errors="ignore").decode() #remove non ascii chars
    chars_to_remove = [")","(",".","|","[","]","{","}","'","-"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']' #remove punc, brackets etc...
    string = re.sub(rx, '', string)
    string = re.sub(' +',' ',string).strip() # get rid of multiple spaces and replace with a single
    string = ' '+ string +' ' # pad names for ngrams...
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]


###FIRST TIME RUN - takes about 5 minutes... used to build the matching table
from sklearn.feature_extraction.text import TfidfVectorizer
import time
t1 = time.time() # used for timing - can delete
##### Create a list of items to match here:
dei_match = list(dei_df["MatchPoint"].unique()) #unique org names from company watch file
#Building the TFIDF off the clean dataset - takes about 5 min
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(dei_match)
t = time.time()-t1
print("Time:", t) # used for timing - can delete
print(tf_idf_matrix.shape)

import time
t1 = time.time()
##### Create a list of messy items to match here:
sierra_match = list(sierra_df["MatchPoint"].unique()) #unique list of names

#Creation of vectors for the messy names

# #FOR LOADING ONLY - only required if items have been saved previously
# vectorizer = pickle.load(open("Data/vectorizer.pkl","rb"))
# tf_idf_matrix = pickle.load(open("Data/Comp_tfidf.pkl","rb"))
# org_names = pickle.load(open("Data/Comp_names.pkl","rb"))

messy_tf_idf_matrix = vectorizer.transform(sierra_match)

Time: 0.2453305721282959
(3950, 6862)


In [7]:
import nmslib
from scipy.sparse import csr_matrix # may not be required 
from scipy.sparse import rand # may not be required


# create a random matrix to index
data_matrix = tf_idf_matrix#[0:1000000]

# Set index parameters
# These are the most important ones
M = 80
efC = 1000

num_threads = 4 # adjust for the number of threads
# Intitialize the library, specify the space, the type of the vector and add data points 
index = nmslib.init(method='simple_invindx', space='negdotprod_sparse_fast', data_type=nmslib.DataType.SPARSE_VECTOR) 

index.addDataPointBatch(data_matrix)
# Create an index
start = time.time()
index.createIndex() 
end = time.time() 
print('Indexing time = %f' % (end-start))

Indexing time = 0.026931


In [8]:
# Number of neighbors 
num_threads = 4
K=1
query_matrix = messy_tf_idf_matrix
start = time.time() 
query_qty = query_matrix.shape[0]
nbrs = index.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)
end = time.time() 
print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' % 
      (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty))

kNN time total=15.233992 (sec), per query=0.000142 (sec), per query adjusted for thread number=0.000566 (sec)


In [9]:
mts =[]
for i in range(len(nbrs)):
  original_nm = sierra_match[i]
  try:
    matched_nm   = dei_match[nbrs[i][0][0]]
    conf         = nbrs[i][1][0]
  except:
    matched_nm   = "no match found"
    conf         = None
  mts.append([original_nm,matched_nm,conf])

mts = pd.DataFrame(mts,columns=['dei_match','sierra_match','conf'])

In [10]:
mts = mts.sort_values(by=['conf'])
mts

Unnamed: 0,dei_match,sierra_match,conf
24754,Khurrum Rahman East of Hounslow,Khurrum Rahman East of Hounslow,-1.000000
70343,Zuri Day Sin City vows,Zuri Day Sin City Vows,-1.000000
98315,TJ Klune Under the whispering door,T.J. Klune Under the Whispering Door,-1.000000
89959,Caroline Kim The prince of mournful thoughts a...,Caroline Kim The Prince of Mournful Thoughts a...,-1.000000
3681,Jude Sierra A tiny piece of something greater,Jude Sierra A Tiny Piece of Something Greater,-1.000000
...,...,...,...
44698,V Gubarev Korolevstvo krivykh zerkal : detskie...,Kris Ripper The Love Study,-0.094707
46118,Alekseĭ Zhivoĭ Legion : Pryzhok lʹva ; Ispansk...,Nikki Giovanni Make Me Rain: Poems & Prose,-0.094303
44717,Dzh E Braĭt Kotënok Shmi︠a︡k i zagadochnoe zer...,Megan Rapinoe One Life,-0.093772
97897,B Akunin Ubitʹ zmeenysha : pʹesa,Victoria Chang Obit,-0.090587


In [None]:
#mts.to_excel("matched_output.xlsx")

In [11]:
len(mts.loc[mts['conf'] < -.66]) / len(dei_df)

0.568072745642839

In [12]:
results = sierra_df.merge(mts.loc[mts['conf'] < -.66],left_on='MatchPoint',right_on='sierra_match')

In [13]:
results[['Title','Author','Year','Location','dei_match','conf']]

Unnamed: 0,Title,Author,Year,Location,dei_match,conf
0,Alienation,Inés Estrada,2019,"arla,blma,ca5a,maya,meda,soma",Inés Estrada Alienation,-1.0
1,All the Gay Saints,Kayleb Rae Candrilli,2020,ntna,Kayleb Rae Candrilli All the Gay Saints,-1.0
2,Anodyne,Khadijah Queen,2020,"cama,ddma,lexan,ntna",Khadijah Queen Anodyne,-1.0
3,Aria,Nazanine Hozar,2020,"arlan,bedan,blma,brka,ca4a,ca9a,cama,conan,fp2...",Nazanine Hozar Aria,-1.0
4,Assembly,Natasha Brown,2021,"ar2an,arlan,bedan,blman,br2an,brkan,ca5a,caman...",Natasha Brown Assembly,-1.0
...,...,...,...,...,...,...
148,Washington Black,Esi Edugyan,2019,"cama,camas,fp2a,fpla,lasa,lexa,nora,ntna,soma,...",Esi Edugyan Washington Black,-1.0
149,Welcome to Lagos,Chibundu Onuzo,2018,"arla,beda,blma,brka,ca5a,ca9a,cama,cona,fpla,l...",Chibundu Onuzo Welcome to Lagos,-1.0
150,Whereabouts,Jhumpa Lahiri,2021,"ac2a,actan,actas,ar2an,ar2as,arlan,arlas,ashan...",Jhumpa Lahiri Whereabouts,-1.0
151,Wonderland,Zoje Stage,2020,"actan,arla,asha,cama,fpla,lexan,mlda,mwya,neea...",Zoje Stage Wonderland,-1.0


In [16]:
results[['Title','Author','Year','Location','dei_match','conf']].to_excel("matched_holdings.xlsx")

In [14]:
mts.loc[mts['conf'] >= -.66]

Unnamed: 0,dei_match,sierra_match,conf
90281,Louise Erdrich The range eternal,Louise Erdrich The Sentence,-0.659733
12055,Nnedi Okorafor Binti : home,Nnedi Okorafor Noor,-0.658454
70437,Jacob Tobia Sissy,Jacob Tobia Sissy: A Coming-of-Gender Story,-0.658407
20810,Daniel R Day Dapper Dan : made in Harlem : a m...,"Daniel R. Day with Mikael Awake, Dapper Dan: M...",-0.658191
100358,Hasan Namir War / torn,Hasan Namir War/Torn,-0.657558
...,...,...,...
44698,V Gubarev Korolevstvo krivykh zerkal : detskie...,Kris Ripper The Love Study,-0.094707
46118,Alekseĭ Zhivoĭ Legion : Pryzhok lʹva ; Ispansk...,Nikki Giovanni Make Me Rain: Poems & Prose,-0.094303
44717,Dzh E Braĭt Kotënok Shmi︠a︡k i zagadochnoe zer...,Megan Rapinoe One Life,-0.093772
97897,B Akunin Ubitʹ zmeenysha : pʹesa,Victoria Chang Obit,-0.090587


In [None]:
dei_df

In [15]:
#doesn't work
missing_results = dei_df.merge(mts.loc[mts['conf'] > -.66],left_on='MatchPoint',right_on='dei_match')
missing_results[['Title','Author','Year','Genre','sierra_match','conf']]

Unnamed: 0,Title,Author,Year,Genre,sierra_match,conf
