In [1]:
import sqlite3
import pandas as pd
import psycopg2
import configparser
import fuzzymatcher

In [2]:
# Read every row of the title_list table from the local SQLite database into
# dei_rows (a list of tuples, one per table row).
sqliteConnection = None
try:
    sqliteConnection = sqlite3.connect('DEI Database.db')
    cursor = sqliteConnection.cursor()
    print("Connected to SQLite")

    sql_query = """SELECT *
                FROM title_list
                """
    # A plain SELECT changes nothing, so no commit is needed afterwards.
    cursor.execute(sql_query)
    dei_rows = cursor.fetchall()
    cursor.close()
except sqlite3.Error as error:
    print("Failed to run query", error)
    # Re-raise so the notebook stops here instead of carrying on with a
    # missing (or stale, from an earlier run) dei_rows.
    raise
finally:
    # Always release the database file handle, even when the query failed.
    if sqliteConnection is not None:
        sqliteConnection.close()

Connected to SQLite


In [3]:
# Labels are applied positionally to the tuples in dei_rows, so their order
# has to match the order of the values inside each row.
column_names = [
    "Genre",
    "Author",
    "Title",
    "Year",
]
dei_df = pd.DataFrame(data=dei_rows, columns=column_names)

In [23]:
# Display the DataFrame built from dei_rows as this cell's output.
dei_df

Unnamed: 0,Genre,Author,Title,Year
0,Romance/Erotic Romance,Charlie Adhara,The Wolf at the Door,2018.0
1,Romance/Erotic Romance,Charlie Adhara,The Wolf at Bay,2018.0
2,Romance/Erotic Romance,Charlie Adhara,Thrown to the Wolves,2019.0
3,Romance/Erotic Romance,Charlie Adhara,Wolf in Sheep’s Clothing,2020.0
4,Romance/Erotic Romance,Brea Alepoú,His Bewildered Mate,2019.0
...,...,...,...,...
3954,Fiction Anthologies,,Shades Of Black: Crime And Mystery Stories By ...,2004.0
3955,Fiction Anthologies,,Slay: Stories of the Vampire Noire,2020.0
3956,Fiction Anthologies,,Transcendent 3: The Year’s Best Transgender Sp...,2018.0
3957,Fiction Anthologies,,Transcendent 4,2019.0


In [18]:
# Load the Sierra database credentials. ConfigParser.read() silently skips
# files it cannot open, so check its return value to fail with a clear error.
config = configparser.ConfigParser()
if not config.read('Y:\\SQL Reports\\creds\\app_SIC.ini'):
    raise FileNotFoundError("could not read the Sierra credentials file")

# One row per (title, author, publish_year) with a comma-separated list of the
# distinct item location codes. Restricted to material code 'a', publish year
# 2018 or later, and location codes starting with 'fpl'. The author expression
# rearranges "Last, First (…)" into "First Last" and drops periods.
query = """SELECT
b.best_title AS title,
COALESCE(REPLACE(SPLIT_PART(SPLIT_PART(b.best_author,' (',1),', ',2),'.','')||' '||SPLIT_PART(b.best_author,', ',1),'') AS author,
b.publish_year,
STRING_AGG(DISTINCT i.location_code,',') AS location
FROM
sierra_view.bib_record_property b
JOIN
sierra_view.bib_record_item_record_link l
ON
b.bib_record_id = l.bib_record_id
JOIN
sierra_view.item_record i
ON
l.item_record_id = i.id
WHERE
b.material_code = 'a'
AND b.publish_year >= 2018
AND i.location_code ~ '^fpl'

GROUP BY 1,2,3
ORDER BY 1,2
    """

try:
    #variable connection string should be defined in the imported config file
    conn = psycopg2.connect(config['db']['connection_string'])
except psycopg2.Error:
    print("unable to connect to the database")
    # Stop here: nothing below can work without a connection.
    raise

try:
    with conn.cursor() as cursor:
        cursor.execute(query)
        #For now, just storing the data in a variable. We'll use it later.
        sierra_rows = cursor.fetchall()
finally:
    # Close the connection whether or not the query succeeded.
    conn.close()

In [19]:
# Labels are applied positionally to the tuples in sierra_rows, so their order
# has to match the order of the values inside each row.
column_names_sierra = [
    "Title",
    "Author",
    "Year",
    "Location",
]
sierra_df = pd.DataFrame(data=sierra_rows, columns=column_names_sierra)

In [20]:
# Display the DataFrame built from sierra_rows as this cell's output.
sierra_df

Unnamed: 0,Title,Author,Year,Location
0,El Nuevo Bebé (the New Baby) (Ana & Andrew S...,Christine Platt,2021,fpljm
1,!Mi amigo esta triste!,Mo Willems,2019,fpljm
2,"""Daddy why am I brown?"" : a healthy conversati...",Bedford E F II Palmer,2020,fplj
3,"""Frankly, we did win this election"" : the insi...",Michael C Bender,2021,fplan
4,"""My brothers have my back"" : inside the Novemb...",Lou Pepi,2018,fpla
...,...,...,...,...
17664,¿Quién es Carmen Sandiego?,Rebecca Tinker,2019,fplj
17665,¿Quién fue John F. Kennedy?,Yona Zeldis McDonough,2018,fpljm
17666,¿Y si fuéramos nosotros?,Becky Albertalli,2019,fply
17667,À table : recipes for cooking + eating the Fre...,Rebekah Peppler,2021,fplan


In [29]:
import fuzzymatcher

# The same three fields are compared on both sides of the join.
left_on = ["Author", "Year", "Title"]
right_on = list(left_on)

# Fuzzy left join of the DEI list (left) against the Sierra catalog (right).
# NOTE(review): the result exposes a best_match_score column, which suggests
# one best match per left row rather than several — confirm against the
# fuzzymatcher documentation.
merged_df = fuzzymatcher.fuzzy_left_join(dei_df, sierra_df, left_on, right_on)

In [31]:
# Order the join result from weakest to strongest match score.
merged_df = merged_df.sort_values(by=['best_match_score'])

# Columns worth eyeballing side by side when judging match quality.
review_columns = [
    'best_match_score',
    'Author_left',
    'Author_right',
    'Year_left',
    'Year_right',
    'Title_left',
    'Title_right',
    'Genre',
    'Location',
]

# Show only rows whose score is at least 0.5.
score_ok = merged_df['best_match_score'] >= .5
merged_df.loc[score_ok, review_columns]

Unnamed: 0,best_match_score,Author_left,Author_right,Year_left,Year_right,Title_left,Title_right,Genre,Location
489216,0.511178,Isabel Allende,Isabel Allende,2021.0,2021.0,The Soul of a Woman,"The soul of a woman : on impatient love, long ...",Nonfiction,"fpla,fplan"
249686,0.534923,Gledé Browne Kabongo,Gledé Browne Kabongo,2016.0,2018.0,Game of Fear,Autumn of fear : a fearless novel,Myster/Thriller,fpla
473679,0.535600,Isaac Mizrahi,Isaac Mizrahi,2019.0,2019.0,I.M.: A Memoir,IM : a memoir,Biography/Memoir,fpla
541553,0.550882,Emma Dabiri,Sharee Miller,2019.0,2018.0,Don’t Touch My Hair,Don't touch my hair!,Black Lives and Anti-Racism,fplj
251335,0.553768,Jennifer J. Chow,Jennifer J Chow,2020.0,2020.0,Mimi Lee Gets a Clue,Mimi Lee reads between the lines,Myster/Thriller,fplan
...,...,...,...,...,...,...,...,...,...
534781,4.422029,Andrew Yang,Andrew Yang,2018.0,2018.0,The War on Normal People: The Truth About Amer...,The war on normal people : the truth about Ame...,Nonfiction,fpla
566967,4.483388,DaMaris B. Hill,DaMaris B Hill,2019.0,2019.0,A Bound Woman Is a Dangerous Thing: The Incarc...,A bound woman is a dangerous thing : the incar...,History,fpla
566612,4.870871,Kim Ghattas,Kim Ghattas,2020.0,2020.0,"Black Wave: Saudi Arabia, Iran, and the Forty-...","Black wave : Saudi Arabia, Iran, and the forty...",History,fpla
559606,4.893582,Anna Malaika Tubbs,Anna Malaika Tubbs,2021.0,2021.0,The Three Mothers: How the Mothers of Martin L...,The three mothers : how the mothers of Martin ...,Black Lives and Anti-Racism,fplan


In [40]:
# Number of merged rows scoring at least 0.5, as a fraction of the DEI list length.
confident_rows = merged_df[merged_df['best_match_score'] >= .5]
confident_rows.shape[0] / dei_df.shape[0]

0.21798433947966658

In [14]:
# Write the full join result (all scores) to an Excel workbook in the working directory.
merged_df.to_excel(excel_writer="output.xlsx")

In [None]:
import jellyfish

def get_closest_match(x, list_strings):
    """Return the entry of ``list_strings`` most similar to ``x``.

    Similarity is the Jaro-Winkler score computed by ``jellyfish``; the first
    candidate with the strictly highest score wins.

    Args:
        x: The string to look up.
        list_strings: Iterable of candidate strings. Entries that are not
            strings (e.g. None or NaN from a DataFrame column) are skipped.

    Returns:
        The best-matching candidate, or None when ``x`` is not a string,
        there are no string candidates, or every candidate scores 0.
    """
    if not isinstance(x, str):
        return None

    # Prefer jaro_winkler_similarity and fall back to the older
    # jaro_winkler name, so either jellyfish API works.
    score = getattr(jellyfish, "jaro_winkler_similarity", None) or jellyfish.jaro_winkler

    best_match = None
    highest_jw = 0

    for current_string in list_strings:
        # jellyfish only accepts strings; missing values cannot match anyway.
        if not isinstance(current_string, str):
            continue

        current_score = score(x, current_string)

        if current_score > highest_jw:
            highest_jw = current_score
            best_match = current_string

    return best_match

# Candidate authors from the DEI list. Missing values and duplicates are
# removed once here instead of rescanning the whole column for every Sierra row.
dei_authors = dei_df['Author'].dropna().unique().tolist()

# Replace each Sierra author with its closest DEI author so both frames share
# an exact join key. The whole matched string is kept (indexing it with [0]
# would keep only its first character and fail when no match is found).
sierra_df['Author'] = sierra_df['Author'].apply(lambda x: get_closest_match(x, dei_authors))

# Left join of the DEI list onto the Sierra rows by author. Sierra rows with no
# matched author are dropped first, because pandas would otherwise pair their
# missing keys with DEI rows that have no author.
# NOTE(review): the original joined the undefined names df1/df2; dei_df and
# sierra_df are assumed to be the intended frames — confirm.
df_merge = dei_df.merge(
    sierra_df.dropna(subset=['Author']),
    on='Author',
    how='left',
    suffixes=('_dei', '_sierra'),
)

In [None]:
# Build one lookup key per title_list row: lower-cased title followed directly
# by lower-cased author, with straight apostrophes removed from both.
# NOTE(review): in SQLite, || with a NULL operand yields NULL, so rows without
# an author presumably produce a None key — confirm and handle downstream.
sqliteConnection = None
try:
    sqliteConnection = sqlite3.connect('DEI Database.db')
    cursor = sqliteConnection.cursor()
    print("Connected to SQLite")

    sql_query = """SELECT LOWER(REPLACE(Title,'''',''))||LOWER(REPLACE(Author,'''','')) 
                FROM title_list
                """
    # A plain SELECT changes nothing, so no commit is needed afterwards.
    cursor.execute(sql_query)
    title_match = cursor.fetchall()
    # Each row is a 1-tuple; unwrap it to the bare key.
    title_string = [t[0] for t in title_match]
    cursor.close()
except sqlite3.Error as error:
    print("Failed to run query", error)
    # Re-raise so the notebook stops here instead of carrying on with a
    # missing (or stale, from an earlier run) title_string.
    raise
finally:
    # Always release the database file handle, even when the query failed.
    if sqliteConnection is not None:
        sqliteConnection.close()

In [None]:
# Display the list of title+author lookup keys as this cell's output.
title_string

In [None]:
# Render the lookup keys as a comma-separated list of SQL string literals.
# Quoting is done explicitly (wrap in single quotes, double any embedded
# single quote) because Python's list repr is not SQL quoting: it doubles
# backslashes and may switch to double-quoted strings. Missing keys (None) are
# skipped rather than turned into the literal text 'None'.
title_literals = [
    "'" + str(x).lstrip('\'').replace("'", "''") + "'"
    for x in title_string
    if x is not None
]
# With no keys, emit NULL so the result is still valid inside IN (...) and
# simply matches nothing.
new_title_string = ", ".join(title_literals) if title_literals else "NULL"

In [None]:
# Display the comma-separated, quoted key list built from title_string.
new_title_string

In [None]:
# Lookup keys sent to PostgreSQL as a single array parameter, so the driver
# does the quoting instead of the SQL text being assembled by hand.
# Missing keys (None) cannot match anything and are left out.
title_keys = [str(x).lstrip('\'') for x in title_string if x is not None]

# One row per (title, author, 3-character location prefix) for material code
# 'a' published in 2018 or later, where lower(title)||lower(author) equals one
# of the supplied keys. `owns` is TRUE when the group has at least one item.
# NOTE(review): the SQLite keys have apostrophes removed and no space between
# title and author, while this expression keeps apostrophes and puts a space
# inside the rebuilt author name — confirm both sides are normalised alike.
query = """\
    SELECT
    b.best_title AS title,
    REPLACE(SPLIT_PART(SPLIT_PART(b.best_author,' (',1),', ',2),'.','')||' '||SPLIT_PART(b.best_author,', ',1) AS author,
    SUBSTRING(i.location_code FROM 1 FOR 3) AS location,
    CASE
    WHEN COUNT(i.id) != 0 THEN TRUE
    ELSE FALSE
    END AS owns
    FROM
    sierra_view.bib_record_property b
    JOIN
    sierra_view.bib_record_item_record_link l
    ON
    b.bib_record_id = l.bib_record_id
    JOIN
    sierra_view.item_record i
    ON
    l.item_record_id = i.id
    WHERE
    b.material_code = 'a'
    AND lower(b.best_title)||LOWER(REPLACE(SPLIT_PART(SPLIT_PART(b.best_author,' (',1),', ',2),'.','')||' '||SPLIT_PART(b.best_author,', ',1)) = ANY(%s)
    AND b.publish_year >= 2018

    GROUP BY 1,2,3
    ORDER BY 1,2,3
    """

try:
    #variable connection string should be defined in the imported config file
    conn = psycopg2.connect(config['db']['connection_string'])
except psycopg2.Error:
    print("unable to connect to the database")
    # Stop here: nothing below can work without a connection.
    raise

try:
    with conn.cursor() as cursor:
        # psycopg2 adapts the Python list to a PostgreSQL array; = ANY(...)
        # also accepts an empty array, unlike IN ().
        cursor.execute(query, (title_keys,))
        #For now, just storing the data in a variable. We'll use it later.
        sierra_rows = cursor.fetchall()
finally:
    # Close the connection whether or not the query succeeded.
    conn.close()

In [None]:
# Display the rows fetched from the Sierra query as this cell's output.
sierra_rows