# Jupyter notebook to get an overview of the different subject terms used at the IISH from four different sources
- dateCreated: 2025-01-06
- creator: Liliana Melgar

# Import libraries

In [None]:
import pandas as pd
import numpy as np
import csv
import re
import matplotlib.pyplot as plt

from IPython.display import display, HTML
from IPython.display import clear_output
display(HTML("<style>.container { width:95% !important; }</style>"))
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# import os.path to add paths to files
import os

# Set paths to files

In [None]:
# Paths to where the relevant data is located.
# Every data folder hangs off the same project root, so the root is resolved
# once from the current working directory and reused for each source.
script_dir = os.getcwd()  # Gets the current working directory
project_root = os.path.abspath(os.path.join(script_dir, "..", ".."))  # Moves up two levels to reach 'repo'

# biblio
data_directory_biblio = os.path.join(project_root, "data", "biblio")
data_downloads_biblio = os.path.join(data_directory_biblio, 'downloads')  # path to the folder where the reports will be downloaded

# authority
data_directory_authority = os.path.join(project_root, "data", "authority")
data_downloads_authority = os.path.join(data_directory_authority, 'downloads')  # path to the folder where the reports will be downloaded

# subjects (thesauri)
data_directory_subjects = os.path.join(project_root, "data", "subjects")
data_downloads_subjects = os.path.join(data_directory_subjects, 'downloads')  # path to the folder where the reports will be downloaded

# persons
data_directory_persons = os.path.join(project_root, "data", "persons")
# Built from the persons directory (it used to be derived from the subjects directory).
data_downloads_persons = os.path.join(data_directory_persons, 'downloads')  # path to the folder where the reports will be downloaded


# BIBLIO Persons
- These terms are extracted from the IISH metadata using the public version of the OAI-PMH endpoint. For more information about what BIBLIO contains, see: https://confluence.socialhistoryservices.org/x/S4FeBw.
- The harvesting was done using the code from the "Metadata overviews" repository: https://github.com/lilimelgar/iisg-metadata-overviews
- **The harvesting date was April 4, 2025**.
- Using another Jupyter notebook (https://github.com/lilimelgar/iisg-metadata-overviews/blob/main/biblio/src/biblio_query.ipynb), I created a slice of the entire metadata that keeps only MARC field 100, because field 100 corresponds to the persons/authors/creators in MARC (https://www.loc.gov/marc/bibliographic/bd100.html).

In [None]:
# read csv as dataframe

biblio_persons_df_v0 = pd.read_csv(f'{data_downloads_biblio}/persons_100_subfields.csv', sep=",", low_memory=False)

# biblio_subjectTerms_df_v0 = pd.read_csv(f'{data_directory}/biblio_subjectTerms600.csv.gzip', sep=",", compression='gzip', low_memory=False)
# low_memory=False was set after this warning message: "/var/folders/3y/xbjxw0b94jxg6x2bcbyjsmmcgvnf7q/T/ipykernel_987/2912965462.py:3: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False."

In [None]:
biblio_persons_df_v0.info()

In [None]:
# Turn every float64, np.int_ and object column into text, writing the
# literal string 'null' wherever a value is missing. Columns of any other
# dtype are left untouched.
dtypes_to_stringify = (np.float64, np.int_, object)
for col_name in biblio_persons_df_v0.columns:
    col_dtype = biblio_persons_df_v0.dtypes[col_name]
    if any(col_dtype == target for target in dtypes_to_stringify):
        filled = biblio_persons_df_v0[col_name].fillna('null')
        biblio_persons_df_v0[col_name] = filled.astype(str)

In [None]:
# get an overview of the data
biblio_persons_df = biblio_persons_df_v0.copy()
biblio_persons_df.info(verbose = True, show_counts = True)

In [None]:
# get an overview of the data
biblio_persons_df.shape

In [None]:
# get an overview of the data
biblio_persons_df.head(10)

In [None]:
biblio_persons_df['001'].nunique()

## Records with persons

In [None]:
# create subset of biblio with relevant columns
biblio_persons_df_v0 = biblio_persons_df[['001','100', 'leader_code']]

In [None]:
biblio_persons_df_v0.info()

In [None]:
# Count occurrences of 'null' and 'notNull'
value_counts = biblio_persons_df_v0.groupby('100')['001'].nunique()
value_counts

In [None]:
# Get total unique '001' count
total_unique_ids = biblio_persons_df_v0['001'].nunique()
total_unique_ids

In [None]:
# Donut chart: share of records with and without field 100.
# Labels and colours are keyed by the category value, so each wedge keeps its
# own colour even when one category is absent or the index order changes.
labels_mapping = {
    'notnull': f'records with field 100 ({value_counts.get("notnull", 0)})',
    'null': f'records without field 100 ({value_counts.get("null", 0)})'
}
colors_mapping = {'notnull': '#90ee90', 'null': '#cccccc'}

# Any unexpected category falls back to its raw value and a neutral colour
# instead of raising a KeyError.
custom_labels = [
    labels_mapping.get(label, f'{label} ({value_counts[label]})')
    for label in value_counts.index
]
colors = [colors_mapping.get(label, '#999999') for label in value_counts.index]

# Plot pie chart
fig1, ax = plt.subplots(figsize=(7, 6))
wedges, texts, autotexts = ax.pie(
    value_counts, labels=custom_labels, autopct='%1.1f%%', colors=colors, startangle=90,
    wedgeprops={'edgecolor': 'white', 'linewidth': 2}, pctdistance=0.85
)

# Donut hole: a white circle drawn over the centre of the pie
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
ax.add_artist(centre_circle)

# Center text
ax.text(0, 0, f'Total number of records in Biblio\n{total_unique_ids}',
        ha='center', va='center', fontsize=7, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
name_file = 'plot1_recordsWithPerson100'
fig1.savefig(f'{data_downloads_subjects}/{name_file}.png', format='png', dpi=300, bbox_inches='tight')

### report including leader

In [None]:
# group by null and notnull and leader code
value_counts_with_leader = biblio_persons_df_v0.groupby(['100', 'leader_code'])['001'].count()
value_counts_with_leader.info()

In [None]:
import plotly.express as px

# Assuming `result2` is a MultiIndex Series from groupby:
# Convert to DataFrame and reset index
df = value_counts_with_leader.reset_index(name='count')

# Create a sunburst plot (nested pie chart)
fig = px.sunburst(
    df,
    path=['100', 'leader_code'],  # First level: 'has'/'doesn't have', second level: 'code'
    values='count',
    title='Distribution of Records by Person Presence and Code'
)
fig.show()


In [None]:
import plotly.express as px

df = value_counts_with_leader.reset_index(name='count')

fig = px.treemap(
    df,
    path=['100', 'leader_code'],  # hierarchy levels
    values='count',
    title='Treemap: Record Distribution by Person Presence and Code'
)

fig.show()

In [None]:
df = value_counts_with_leader.reset_index(name='count')

fig = px.bar(
    df,
    x='leader_code',
    y='count',
    color='100',
    barmode='group',
    title='Grouped Bar Chart: Record Counts by Code and 100'
)

fig.show()

## Unique subject terms

In [None]:
# using strings
biblio_persons_df['"a"'].nunique()

In [None]:
# using Ids from authority
biblio_persons_df['"0"'].nunique()

In [None]:
biblio_persons_df.tail()

In [None]:
# rows that carry a name string in column '"a"' but have no identifier in column '"0"'
identifier_missing = biblio_persons_df['"0"'].str.lower().eq("null")
name_present = biblio_persons_df['"a"'].str.lower().ne("null")
test_term1 = biblio_persons_df[identifier_missing & name_present]

In [None]:
test_term1.head(100)

In [None]:
test_term1.shape

In [None]:
test_term1['001'].unique().tolist()

In [None]:
test_term1['001'].nunique()

### Questions about the number of unique subject terms: 
1) Why are the unique counts different? Are the identifiers from 650$0 added only when a term is entered in Authorities? Why are there terms without field 650$0?
2) In the test term below, why do the inconsistencies occur? I noticed that when a term is picked from Authorities, both the id and the string are loaded, but the string can still be changed afterwards. Shouldn't we lock editing of the string in Biblio?

In [None]:
# example to test why there is no one-to-one correspondence between the string and the id in the correspondent authority
# NOTE(review): this filter used to read the column '"001"', which no other
# cell references (the record id column is '001'). It now filters on the
# identifier column '"0"', which is presumably what was meant given that the
# next cells list the unique '"0"' and '"a"' values — confirm.
test_term2 = biblio_persons_df[biblio_persons_df['"0"'].str.contains("1050212", case=False, regex=True)]
test_term2

In [None]:
test_term_strings = test_term2['"0"'].unique().tolist()
for term in test_term_strings:
    print(term)

In [None]:
test_term_strings = test_term2['"a"'].unique().tolist()
for term in test_term_strings:
    print(term)

In [None]:
# Create a dataframe grouped per term (column '"a"') showing the counts:
# the number of unique '001' record ids per term, plus the ids themselves
# joined into one comma-separated string.
# The source is biblio_persons_df, the dataframe loaded in this notebook; the
# previously referenced biblio_subjectTerms_df is never defined here.
biblio_subjectUniqueTerms_v0 = biblio_persons_df.groupby('"a"', as_index=False).agg(
    count_of_records=('001', 'nunique'),
    ids=('001', lambda x: ','.join(map(str, x)))
)

# Sort by 'count_of_records' in descending order (most frequent to least frequent)
biblio_subjectUniqueTerms_v1 = biblio_subjectUniqueTerms_v0.sort_values(by='count_of_records', ascending=False)

In [None]:
biblio_subjectUniqueTerms_v1.head()

In [None]:
# remove column with the record Ids
biblio_subjectUniqueTerms_v2 = biblio_subjectUniqueTerms_v1[['"a"','count_of_records']]

In [None]:
biblio_subjectUniqueTerms_v2.info()

In [None]:
# dropping the row that contains the null values
biblio_subjectUniqueTerms_v3 = biblio_subjectUniqueTerms_v2[biblio_subjectUniqueTerms_v2['"a"'] != 'null']

In [None]:
biblio_subjectUniqueTerms_v3.info()

In [None]:
biblio_subjectUniqueTerms_v3.head()

In [None]:
biblio_subjectUniqueTerms_v4 = biblio_subjectUniqueTerms_v3.reset_index(drop=True)

In [None]:
biblio_subjectUniqueTerms_v4.info()

In [None]:
# determine how many persons should be shown
top_r = 20
# create small df for displaying and plotting
biblio_subjectUniqueTerms_top = biblio_subjectUniqueTerms_v4.head(top_r).reset_index(drop=True).copy()

# Plot the top n terms as a horizontal bar chart.
# The chart is drawn on the axes of fig2 (ax=ax); without it pandas opens a
# new figure and fig2, which is the figure saved afterwards, stays empty.
fig2, ax = plt.subplots(figsize=(20, 10))
top_counts = (
    biblio_subjectUniqueTerms_top.groupby(['"a"'])['count_of_records']
    .sum()
    .sort_values(ascending=True)  # ascending so the largest bar ends up on top
    .tail(top_r)
)
top_counts.plot(kind='barh', ax=ax)
# NOTE(review): the labels mention field 650$a, while this notebook loads
# persons_100_subfields.csv — confirm the intended wording.
ax.set_title("Top terms in Biblio's 650$a field")
ax.set_xlabel("Number biblio records")
ax.set_ylabel("Term $a")

In [None]:
name_file = 'plot2_uniqueSubjectTerms650a--'
# Save the figure as PNG
fig2.savefig(f'{data_downloads_subjects}/{name_file}.png', format='png', dpi=300, bbox_inches='tight')


In [None]:
# export all list of unique subject terms with the number of occurrences in Biblio
biblio_subjectUniqueTerms_v1.rename(columns={'"a"': '650a', 'count_of_records': 'count_records_biblio', 'ids': 'biblio_record_ids'}, inplace=True)

name_file = 'unique_650a_with_counts_and_recordIds'

biblio_subjectUniqueTerms_v1.to_csv(f'{data_downloads_subjects}/{name_file}.csv', index=False) # if too big, use compression='gzip'

In [None]:
biblio_subjectUniqueTerms_v1.info()

In [None]:
biblio_subjectUniqueTerms = biblio_subjectUniqueTerms_v1.reset_index(drop=True).copy()

In [None]:
list_unique_terms_biblio = biblio_subjectUniqueTerms_v1['650a'].unique().tolist()

In [None]:
len(list_unique_terms_biblio)

In [None]:
biblio_subjectUniqueTerms.info()

# AUTHORITIES subject terms
- These terms are extracted from the IISH metadata using the public version of the OAI-PMH endpoint. For more information about what AUTHORITIES contains, see: https://confluence.socialhistoryservices.org/x/S4FeBw.
- The harvesting was done using the code from the "Metadata overviews" repository: https://github.com/lilimelgar/iisg-metadata-overviews
- The harvesting date was November 13th, 2024.
- Using another Jupyter notebook (https://github.com/lilimelgar/iisg-metadata-overviews/blob/main/biblio/src/biblio_query.ipynb), I created a slice of the entire metadata selecting only the MARC fields that start with 6, because the 6XX range corresponds to the group of subject terms in MARC (https://www.loc.gov/marc/bibliographic/bd6xx.html).

In [None]:
# read csv as dataframe
authorities_subjectTerms_df_v0 = pd.read_csv(f'{data_downloads_authority}/subject_terms_per_150.csv', sep=",", low_memory=False)
# low_memory=False was set after this warning message: "/var/folders/3y/xbjxw0b94jxg6x2bcbyjsmmcgvnf7q/T/ipykernel_987/2912965462.py:3: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False."

In [None]:
# get an overview of the data
authorities_subjectTerms_df_v0.info(verbose = True, show_counts = True)

In [None]:
# Turn every float64, np.int_ and object column into text, writing the
# literal string 'null' wherever a value is missing. Columns of any other
# dtype are left untouched.
dtypes_to_stringify = (np.float64, np.int_, object)
for col_name in authorities_subjectTerms_df_v0.columns:
    col_dtype = authorities_subjectTerms_df_v0.dtypes[col_name]
    if any(col_dtype == target for target in dtypes_to_stringify):
        filled = authorities_subjectTerms_df_v0[col_name].fillna('null')
        authorities_subjectTerms_df_v0[col_name] = filled.astype(str)

In [None]:
# convert id to string
authorities_subjectTerms_df_v0['001'] = authorities_subjectTerms_df_v0['001'].astype(str)
authorities_subj_df_v1 = authorities_subjectTerms_df_v0.copy()

In [None]:
# get an overview of the data
authorities_subj_df_v1.shape

In [None]:
# get an overview of the data
authorities_subj_df_v1.tail(10)

In [None]:
# get an overview of the data
authorities_subj_df_v1.describe()

In [None]:
query_test12 = authorities_subj_df_v1[authorities_subj_df_v1['150'].str.contains("⑄", case=False, regex=True)] #
query_test12

In [None]:
# TEMPORARILY DROP THE OUTLIER
# NOTE(review): 300 is a hard-coded index label. Presumably it is the row
# surfaced by the "⑄" query in the previous cell — confirm against the current
# export, because drop() raises a KeyError when the label does not exist and
# would silently remove a different row if the data were re-harvested.
authorities_sub_df_v2 = authorities_subj_df_v1.drop(300).copy()

In [None]:
authorities_sub_df_v2.shape

In [None]:
query_test13 = authorities_sub_df_v2[authorities_sub_df_v2['150'].str.contains('"a":', case=False, regex=True)] #
query_test13

In [None]:
query_test13.shape

In [None]:
# Derive the plain term ('150a') by removing the literal '"a":' marker from
# the start of field 150. The marker is removed as a prefix, not with
# str.lstrip: lstrip treats its argument as a set of characters and would
# also eat leading 'a', ':' or '"' characters that belong to the term itself.
# Values that do not start with the marker are kept unchanged.
subfield_a_marker = '"a":'
authorities_sub_df_v2['150a'] = authorities_sub_df_v2['150'].map(
    lambda value: value[len(subfield_a_marker):] if value.startswith(subfield_a_marker) else value
)

In [None]:
# authorities_subj_df['150a'].unique()

In [None]:
list_unique_terms_authority = authorities_sub_df_v2['150a'].unique().tolist()

In [None]:
print(type(list_unique_terms_authority))

In [None]:
len(list_unique_terms_authority)

In [None]:
authorities_sub_df_v2.info()

In [None]:
for item in list_unique_terms_authority:
    print(item)

In [None]:
authorities_sub_df_v2.head()

In [None]:
authorities_sub_df = authorities_sub_df_v2.reset_index(drop=True)

In [None]:
authorities_sub_df.info(verbose=True)

# Comparing biblio unique terms with authorities unique terms

In [None]:
dfA = biblio_subjectUniqueTerms
dfB = authorities_sub_df

In [None]:
dfA.info()

In [None]:
dfB.info()

In [None]:
# This python script detects the string similarity between two lists of concepts/terms.
# Every term in dfB (column '150a') is compared with every term in dfA
# (column '650a'); only pairs with a perfect token-sort score are kept.

from tqdm import tqdm
from time import sleep
from fuzzywuzzy import fuzz

# Define score thresholds
rangeScoreHigh = 100  # score a pair must reach to be recorded as a match
rangeScoreMid = 99     # currently unused; kept for experimenting with looser matches

# Read the dataset A terms once instead of re-iterating its rows for every term of B
terms_a = dfA['650a'].tolist()

# List to accumulate matches (faster than appending to DataFrame directly)
row_list = []

############################## RUN STRING MATCHING #######################################
for nameStringB in tqdm(dfB['150a'], total=dfB.shape[0]):
    for nameStringA in terms_a:
        if fuzz.token_sort_ratio(nameStringA, nameStringB) == rangeScoreHigh:
            row_list.append({
                'match_nameStringB': nameStringB,
                'nameStringA': nameStringA,
            })

# Convert list to DataFrame once (efficient); the columns are named explicitly
# so the result has the same layout even when no match was found
mapped_candidates = pd.DataFrame(row_list, columns=['match_nameStringB', 'nameStringA'])



In [None]:
mapped_candidates.info()

# THESAURUS (poolparty)
Here I import an export from Poolparty with the concepts and their broader and narrower terms.
Here is the query I used:

```
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

SELECT ?Concept 
       (GROUP_CONCAT(DISTINCT ?Label; separator=", ") AS ?PreferredLabels)
       (GROUP_CONCAT(DISTINCT ?AltLabel; separator=", ") AS ?AlternativeLabels)
       (GROUP_CONCAT(DISTINCT CONCAT(STR(?Broader), " (", COALESCE(?BroaderLabel, "No Label"), ")"); separator=", ") AS ?BroaderConcepts)
       (GROUP_CONCAT(DISTINCT CONCAT(STR(?Narrower), " (", COALESCE(?NarrowerLabel, "No Label"), ")"); separator=", ") AS ?NarrowerConcepts)
WHERE {
  ?Concept a skos:Concept .
  OPTIONAL { ?Concept skos:prefLabel ?Label . }
  OPTIONAL { ?Concept skos:altLabel ?AltLabel . }
  
  OPTIONAL { 
    ?Concept skos:broader ?Broader .
    OPTIONAL { ?Broader skos:prefLabel ?BroaderLabel . }
  }
  
  OPTIONAL { 
    ?Concept skos:narrower ?Narrower .
    OPTIONAL { ?Narrower skos:prefLabel ?NarrowerLabel . }
  }
}
GROUP BY ?Concept
ORDER BY ?Concept

```

In [None]:
# read csv as dataframe
poolparty_df_v0 = pd.read_csv(f'{data_directory_subjects}/poolparty/iish-poolparty-query-result.tsv', sep="\t", low_memory=False)
# low_memory=False was set after this warning message: "/var/folders/3y/xbjxw0b94jxg6x2bcbyjsmmcgvnf7q/T/ipykernel_987/2912965462.py:3: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False."

In [None]:
poolparty_df_v0.info()

In [None]:
poolparty_df_v0.head(1)

In [None]:
# Turn every float64, np.int_ and object column into text, writing the
# literal string 'null' wherever a value is missing. Columns of any other
# dtype are left untouched.
dtypes_to_stringify = (np.float64, np.int_, object)
for col_name in poolparty_df_v0.columns:
    col_dtype = poolparty_df_v0.dtypes[col_name]
    if any(col_dtype == target for target in dtypes_to_stringify):
        filled = poolparty_df_v0[col_name].fillna('null')
        poolparty_df_v0[col_name] = filled.astype(str)

In [None]:
poolparty_df_v0.info()

In [None]:
poolparty_df_v1 = poolparty_df_v0.copy()

In [None]:
test10 = poolparty_df_v1[poolparty_df_v1['PreferredLabels'].str.contains("strike*|staking*", case=False, regex=True)]
test10

In [None]:
# Give every preferred label its own row.
# Step 1: split the "PreferredLabels" string on ", " (comma + space) into a list of labels
preferred_label_lists = poolparty_df_v1["PreferredLabels"].str.split(", ")

# Step 2: put the lists back in the column and explode them into one row per
# label, renumbering the index from 0
poolparty_df_v1 = poolparty_df_v1.assign(
    PreferredLabels=preferred_label_lists
).explode("PreferredLabels", ignore_index=True)


In [None]:
poolparty_df_v1.info()

In [None]:
test11 = poolparty_df_v1[poolparty_df_v1['PreferredLabels'].str.contains("strike*|staking*", case=False, regex=True)]
test11

In [None]:
poolparty_df = poolparty_df_v1.reset_index(drop=True)

In [None]:
poolparty_df.info(verbose=True)

In [None]:
# # You may want to dowload the table above to an excel file for further inspection:
# name_file = 'poolparty_concepts_per_label' # for thesaurus report

# # field_subset_df.to_excel(f'{data_downloads}/{name_file}.xlsx')

# ## or download to csv
# poolparty_df.to_csv(f'{data_directory_subjects}/poolparty/{name_file}.csv', index=False) # if too big, use compression='gzip'

# MAPPINGS authority - Poolparty

ListA = authorities_sub_df
ListB = poolparty_df

<!-- Authorities -->
<!-- #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   001     982 non-null    object
 1   150     982 non-null    object
 2   450     982 non-null    object
 3   550     982 non-null    object
 4   leader  982 non-null    object
 5   150a    982 non-null    object -->


<!-- Poolparty -->
<!-- #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Concept            2213 non-null   object
 1   PreferredLabels    2213 non-null   object
 2   AlternativeLabels  2213 non-null   object
 3   BroaderConcepts    2213 non-null   object
 4   NarrowerConcepts   2213 non-null   object -->

In [None]:
# This python script detects the string similarity between two lists of concepts/terms.

from tqdm import tqdm
from time import sleep
from fuzzywuzzy import fuzz


def compare_strings(dfA, dfB):
    '''Maps candidate terms between two dataframes by fuzzy string similarity.

    Every row of dfB is compared with every row of dfA using
    fuzz.token_sort_ratio on the lower-cased strings. A pair is kept as a
    candidate when its score lies between 80 and 100 (both inclusive).

    Inputs:
        dfA: dataframe to be mapped to. Must provide the columns '001',
            '150a', '150', '450', '550' and 'leader'. It is only read, never
            modified.
        dfB: dataframe to be mapped. Must provide the columns 'Concept',
            'PreferredLabels', 'AlternativeLabels', 'BroaderConcepts' and
            'NarrowerConcepts'.

    Outputs a dataframe with one row per candidate pair and the columns
    'scoreString', 'scoreType', 'idA', 'match_idB', 'stringA',
    'match_stringB', 'marc150', 'marc450', 'marc550', 'leader',
    'alternativeLabelsB', 'broaderConceptsB' and 'narrowerConceptsB'.
    The same columns are present (with zero rows) when nothing matches or
    when either input is empty.
    '''

    ############################## SET STRING MATCHING SETTINGS #######################################
    # Inclusive score window for accepting a pair as a candidate
    rangeScoreVeryLow = 80
    rangeScoreHigh = 100

    output_columns = [
        'scoreString', 'scoreType', 'idA', 'match_idB', 'stringA',
        'match_stringB', 'marc150', 'marc450', 'marc550', 'leader',
        'alternativeLabelsB', 'broaderConceptsB', 'narrowerConceptsB',
    ]

    ############################## CAPTURE VARIABLES FROM DFs #######################################
    # Read the dataset A (to be mapped to) columns once, lower-casing each
    # string a single time, instead of looking them up again for every row of B
    candidatesA = [
        (idA, stringA, stringA.lower(), marc150, marc450, marc550, leader)
        for idA, stringA, marc150, marc450, marc550, leader in zip(
            dfA['001'], dfA['150a'], dfA['150'], dfA['450'], dfA['550'], dfA['leader']
        )
    ]

    ##COLLECT ROWS
    rows = []

    ############################## RUN STRING MATCHING #######################################
    for _, rowB in tqdm(dfB.iterrows(), total=dfB.shape[0]):
        # Capture basic standard columns for the mapping dataset B (to be mapped) as variables
        idB = rowB['Concept']
        stringB = rowB['PreferredLabels']
        alternativeLabelsB = rowB['AlternativeLabels']
        broaderConceptsB = rowB['BroaderConcepts']
        narrowerConceptsB = rowB['NarrowerConcepts']
        stringB_lower = stringB.lower()

        for idA, stringA, stringA_lower, marc150, marc450, marc550, leader in candidatesA:
            # Algorithm to be used
            matchScore1 = fuzz.token_sort_ratio(stringA_lower, stringB_lower)

            if rangeScoreVeryLow <= matchScore1 <= rangeScoreHigh:
                rows.append({
                    'scoreString': matchScore1,
                    'scoreType': 'matchScore1',
                    'idA': idA,
                    'match_idB': idB,
                    'stringA': stringA,
                    'match_stringB': stringB,
                    'marc150': marc150,
                    'marc450': marc450,
                    'marc550': marc550,
                    'leader': leader,
                    'alternativeLabelsB': alternativeLabelsB,
                    'broaderConceptsB': broaderConceptsB,
                    'narrowerConceptsB': narrowerConceptsB
                })

    # Build the result once, after all comparisons are done
    return pd.DataFrame(rows, columns=output_columns)


In [None]:
# dfA = authorities_sub_df
# dfB = poolparty_df

mapped_candidates = compare_strings(authorities_sub_df, poolparty_df)

In [None]:
mapped_candidates.head()

In [None]:
mapped_candidates.info(verbose=True)

In [None]:
# Inspect the mapped candidates whose authority string matches the pattern.
# The column produced by compare_strings is 'stringA' (lower-case 's').
test12 = mapped_candidates[mapped_candidates['stringA'].str.contains("strike*|staking*", case=False, regex=True)]
test12

In [None]:
# NOT USED

In [None]:
# # how many terms are in Biblio that have a correspondent Authority record?
# biblio_subjectTerms_notnull = biblio_subjectTerms_df[biblio_subjectTerms_df['650'] == 'notnull']
# question2 = biblio_subjectTerms_notnull[biblio_subjectTerms_notnull['"0"'].str.contains("NL-AMISG)", case=True, regex=False)]
# question2_total = question2.shape[0]
# question2_total

In [None]:
# # how many terms are in Biblio that do not have a correspondent Authority record?
# biblio_subjectTerms_a_notnull = biblio_subjectTerms_df[biblio_subjectTerms_df['"a"'] is not ]
# question3 = biblio_subjectTerms_notnull[~biblio_subjectTerms_notnull['"0"'].str.contains("", case=True, regex=False)]
# question3_total = biblio_subjectTerms_null.shape[0]
# question3_total

In [None]:
# questionTest = biblio_subjectTerms_df['"0"'].value_counts()
# questionTest

In [None]:
# query_test = subject_terms_df[
#     subject_terms_df['"a"'].str.contains("collective", case=False, regex=True) & 
#     subject_terms_df["another_column"].str.contains("stringTest", case=False, regex=True)
# ]

In [None]:
# subject_terms_df['"x"'].unique()
# query_test = subject_terms_df[subject_terms_df['"a"'].str.contains("collective", case=False, regex=True)] #
# query_test
# query_test['"a"'].unique()