In [1]:
import sqlite3
import pandas as pd
import psycopg2
import configparser

In [2]:
try:
    sqliteConnection = sqlite3.connect('DEI Database.db')
    cursor = sqliteConnection.cursor()
    print("Connected to SQLite")
    
    sql_query = """SELECT *
                FROM title_list
                """
    cursor.execute(sql_query)
    sqliteConnection.commit()
    dei_rows = cursor.fetchall()
    cursor.close()
except sqlite3.Error as error:
    print("Failed to run query", error)   

Connected to SQLite


In [3]:
column_names = ["Genre","Author","Title","Year"]
dei_df = pd.DataFrame(dei_rows, columns=column_names)
dei_df

Unnamed: 0,Genre,Author,Title,Year
0,Romance/Erotic Romance,Charlie Adhara,The Wolf at the Door,2018.0
1,Romance/Erotic Romance,Charlie Adhara,The Wolf at Bay,2018.0
2,Romance/Erotic Romance,Charlie Adhara,Thrown to the Wolves,2019.0
3,Romance/Erotic Romance,Charlie Adhara,Wolf in Sheep’s Clothing,2020.0
4,Romance/Erotic Romance,Brea Alepoú,His Bewildered Mate,2019.0
...,...,...,...,...
3954,Fiction Anthologies,,Shades Of Black: Crime And Mystery Stories By ...,2004.0
3955,Fiction Anthologies,,Slay: Stories of the Vampire Noire,2020.0
3956,Fiction Anthologies,,Transcendent 3: The Year’s Best Transgender Sp...,2018.0
3957,Fiction Anthologies,,Transcendent 4,2019.0


In [4]:
config = configparser.ConfigParser()
config.read('Y:\\SQL Reports\\creds\\app_SIC.ini')

try:
    query = """SELECT
b.best_title AS title,
COALESCE(REPLACE(SPLIT_PART(SPLIT_PART(b.best_author,' (',1),', ',2),'.','')||' '||SPLIT_PART(b.best_author,', ',1),'') AS author,
b.publish_year,
STRING_AGG(DISTINCT i.location_code,',') AS location
FROM
sierra_view.bib_record_property b
JOIN
sierra_view.bib_record_item_record_link l
ON
b.bib_record_id = l.bib_record_id
JOIN
sierra_view.item_record i
ON
l.item_record_id = i.id
WHERE
b.material_code = 'a'
AND b.publish_year >= 2018
AND i.location_code ~ '^lin'

GROUP BY 1,2,3
ORDER BY 1,2
    """
    #variable connection string should be defined in the imported config file
    conn = psycopg2.connect( config['db']['connection_string'] )
except:
    print("unable to connect to the database")
    clear_connection()
cursor = conn.cursor()
cursor.execute(query)
#For now, just storing the data in a variable. We'll use it later.
sierra_rows = cursor.fetchall()
conn.close()

In [5]:
column_names_sierra = ["Title", "Author", "Year", "Location"]
sierra_df = pd.DataFrame(sierra_rows, columns=column_names_sierra)
sierra_df

Unnamed: 0,Title,Author,Year,Location
0,#MeToo and you : everything you need to know a...,Halley Bondy,2021,liny
1,#wanderlust : the world's 500 most unforgettab...,Sabina Trojanova,2020,linan
2,(((Semitism))) : being Jewish in America in th...,Jonathan Weisman,2018,lina
3,(It's great to) suck at something : the unexpe...,Karen Rinaldi,2019,lina
4,1 2 3 Cats,Lesléa Newman,2021,linj
...,...,...,...,...
11177,iPhone for seniors for dummies,Dwight Spivey,2018,lina
11178,kimotinâniwiw itwêwina,Melanie Florence,2019,linj
11179,¡Vamos! : let's go eat,1976- author Raúl the Third,2020,linj
11180,¡Vamos! Let's go to the market,1976- author Raúl the Third,2019,linj


In [7]:
import pandas as pd
import numpy as np
import os
import re
import time
import re

#!pip install ftfy #  text cleaning for decode issues..

#from ftfy import fix_text
def ngrams(string, n=3):
    """Takes an input string, cleans it and converts to ngrams. 
    This script is focussed on cleaning UK company names but can be made generic by removing lines below"""
    string = str(string)
    string = string.lower() # lower case
    #string = fix_text(string) # fix text
    string = string.encode("ascii", errors="ignore").decode() #remove non ascii chars
    chars_to_remove = [")","(",".","|","[","]","{","}","'","-"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']' #remove punc, brackets etc...
    string = re.sub(rx, '', string)
    string = re.sub(' +',' ',string).strip() # get rid of multiple spaces and replace with a single
    string = ' '+ string +' ' # pad names for ngrams...
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]
'''
input1_csv = 'Gov Orgs ONS.csv'
input1_column = 'Institutions'
input2_csv = 'messy org names.csv'
input2_column = 'buyer'
'''

###FIRST TIME RUN - takes about 5 minutes... used to build the matching table
from sklearn.feature_extraction.text import TfidfVectorizer
import time
t1 = time.time() # used for timing - can delete
##### Create a list of items to match here:
dei_names = list(dei_df["Author"].unique()) #unique org names from company watch file
#Building the TFIDF off the clean dataset - takes about 5 min
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(dei_names)
t = time.time()-t1
print("Time:", t) # used for timing - can delete
print(tf_idf_matrix.shape)

import time
t1 = time.time()
##### Create a list of messy items to match here:
sierra_names = list(sierra_df["Author"].unique()) #unique list of names

#Creation of vectors for the messy names

# #FOR LOADING ONLY - only required if items have been saved previously
# vectorizer = pickle.load(open("Data/vectorizer.pkl","rb"))
# tf_idf_matrix = pickle.load(open("Data/Comp_tfidf.pkl","rb"))
# org_names = pickle.load(open("Data/Comp_names.pkl","rb"))

messy_tf_idf_matrix = vectorizer.transform(sierra_names)

Time: 0.17818117141723633
(2488, 4713)


In [8]:
import nmslib
from scipy.sparse import csr_matrix # may not be required 
from scipy.sparse import rand # may not be required


# create a random matrix to index
data_matrix = tf_idf_matrix#[0:1000000]

# Set index parameters
# These are the most important ones
M = 80
efC = 1000

num_threads = 4 # adjust for the number of threads
# Intitialize the library, specify the space, the type of the vector and add data points 
index = nmslib.init(method='simple_invindx', space='negdotprod_sparse_fast', data_type=nmslib.DataType.SPARSE_VECTOR) 

index.addDataPointBatch(data_matrix)
# Create an index
start = time.time()
index.createIndex() 
end = time.time() 
print('Indexing time = %f' % (end-start))

Indexing time = 0.010952


In [9]:
# Number of neighbors 
num_threads = 4
K=1
query_matrix = messy_tf_idf_matrix
start = time.time() 
query_qty = query_matrix.shape[0]
nbrs = index.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)
end = time.time() 
print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' % 
      (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty))

kNN time total=0.130250 (sec), per query=0.000018 (sec), per query adjusted for thread number=0.000071 (sec)


In [11]:
mts =[]
for i in range(len(nbrs)):
  original_nm = dei_names[i]
  try:
    matched_nm   = dei_names[nbrs[i][0][0]]
    conf         = nbrs[i][1][0]
  except:
    matched_nm   = "no match found"
    conf         = None
  mts.append([original_nm,matched_nm,conf])

mts = pd.DataFrame(mts,columns=['dei_Author','matched_Author','conf'])

IndexError: list index out of range

In [None]:

results = df_CF.merge(mts,left_on='Author',right_on='original_nm')