In [2]:
import sys
if "../" not in sys.path:
    sys.path.append("../")

import pickle
import re

import pandas as pd
from pymarc import Record
from numpy import nan

### Process raw data into query terms

In [None]:
# Load the raw VOC results CSV; its first column becomes the DataFrame index.
# Forward slashes keep this relative path valid on Windows, macOS and Linux
# (the original backslash form only resolved on Windows).
# The columns keyed 4, 9, 10 and 11 are read as str instead of being type-inferred.
raw_music_df = pd.read_csv("../data/raw/VOC_results.csv", index_col=0, dtype={4:str, 9:str, 10:str, 11:str})

In [None]:
# Sanity check: (rows, columns) of the freshly loaded raw frame.
raw_music_df.shape

In [None]:
# Inspect the dtype pandas assigned to every column of the raw frame.
raw_music_df.dtypes

In [None]:
# Build the working frame from the raw export in one chain:
#   1. discard the columns that are not used further on,
#   2. tag the remaining source fields with a "_raw" suffix / short tag name,
#   3. concatenate the "260" and "264" strings into "260_raw" (NaN counts as ""),
#   4. drop the two source columns now that they are merged.
music_df = (
    raw_music_df
    .drop(columns=["300 a", "650", "650.1", "650.2", "650.3", "650.4", "650.5", "852 ab", "852 j", "852 j.1"])
    .rename(columns={"100": "100_raw", "245": "245_raw", "260 abc":"260", "264 abc": "264"})
    .assign(**{"260_raw": lambda frame: frame["260"].replace(nan, "") + frame["264"].replace(nan, "")})
    .drop(columns=["260", "264"])
)

In [None]:
# Derive the clean "100" value from "100_raw":
#   - keep only the text before the first ",$" (a comma followed by a "$" marker),
#   - remove the leading "$a" marker,
#   - remove any trailing comma.
# removeprefix() drops exactly the "$a" prefix. The previous lstrip("$a") treated
# "$a" as a *set* of characters, so it also ate leading "a"/"$" characters that
# belong to the value itself (e.g. "$aabc" became "bc").
music_df["100"] = (
    music_df["100_raw"]
    .str.split(",$", regex=False, expand=True, n=1)[0]
    .str.removeprefix("$a")
    .str.rstrip(",")
)
music_df["100"]

In [None]:
# Exploratory view: split every raw 245 string wherever a colon is directly
# followed by a "$" marker, one output column per piece.
music_df["245_raw"].str.split(r":\$", regex=True, expand=True)

In [None]:
# Derive the clean "245" value from "245_raw":
#   - keep only the text before the first punctuation mark (, / ; = : . ! ?)
#     that is directly followed by a "$" marker,
#   - remove the leading "$a" marker.
# removeprefix() drops exactly the "$a" prefix. The previous lstrip("$a") treated
# "$a" as a *set* of characters and therefore also removed leading "a"/"$"
# characters that are part of the value (e.g. "$aa la mode" became " la mode").
music_df["245"] = (
    music_df["245_raw"]
    .str.split(r"[,/;=:\.\!\?]\$", regex=True, expand=True, n=1)[0]
    .str.removeprefix("$a")
)
music_df["245"]

In [None]:
# List the cleaned 245 values that still contain a literal "$" character,
# i.e. rows where the split above left a "$" marker in the value.
music_df.loc[music_df["245"].str.contains("$", regex=False), "245"]

In [None]:
def find_date(date, pattern):
    """Return the first substring of ``date`` matched by ``pattern``.

    Parameters
    ----------
    date : str
        Text to search.
    pattern : re.Pattern
        Compiled regular expression; only its ``search`` method is used.

    Returns
    -------
    str or float
        The full matched text, or ``nan`` when the pattern does not occur.
    """
    match = pattern.search(date)
    return nan if match is None else match.group()

# Extract the four-digit 19xx year that directly follows "$cc" in "260_raw";
# rows without such a year receive NaN (see find_date).
music_df["260"] = music_df["260_raw"].apply(
    find_date, pattern=re.compile(r"(?<=\$cc)19[0-9]{2,2}")
)

In [None]:
# Sanity check: (rows, columns) of the processed frame.
music_df.shape

In [None]:
# Relative frequency of every "260" year among the rows that have one.
weights = music_df["260"].value_counts(normalize=True).rename("weights_col")
# Look each row's year up in that table; rows whose year is NaN get a NaN weight.
# A direct map keeps the original index and row order, so no merge or index
# realignment is needed and the result does not depend on how value_counts()
# names its index.
music_df["weights"] = music_df["260"].map(weights)

In [None]:
# Display the processed frame, including the new "260" and "weights" columns.
music_df

In [None]:
# music_df.to_csv("..\\data\\processed\\music_records.csv")

### Sampling and verifying integrity of year distribution

In [None]:
# Reload the processed records from disk; "260" is kept as str so years are not
# parsed as numbers. Forward slashes keep the relative path portable (the
# original backslash form only resolved on Windows).
complete_music_df = pd.read_csv("../data/processed/music_records.csv", index_col=0, dtype={"260": str})
# Draw 10,000 rows without replacement, using the "weights" column as sampling
# weights; the fixed random_state makes the draw repeatable.
music_df = complete_music_df.sample(n=10000, weights="weights", axis=0, random_state=1234)

In [None]:
# Histogram of the sampled "260" years (15 bins), with the x-axis limited to 1900-1990.
music_df["260"].astype(float).hist(bins=15).set_xlim(1900, 1990)

In [None]:
# Histogram of the sampled "260" years (20 bins) over the full value range.
music_df["260"].astype(float).hist(bins=20)

In [None]:
# "260 abc" exists only on the raw frame: music_df had it renamed and dropped
# during processing, so looking it up there raises KeyError when the notebook
# is run top to bottom. Show the 51st-100th most frequent raw values.
raw_music_df["260 abc"].value_counts().iloc[50:100]

In [None]:
# "300 a" exists only on the raw frame: it is dropped when music_df is built,
# so looking it up there raises KeyError when the notebook is run top to bottom.
raw_music_df["300 a"].value_counts()

In [None]:
# https://www.compart.com/en/unicode/U+FF04  Full width dollar symbol
# music_df.apply(lambda x: x.str.replace("$", "＄"), axis=1).head()

### Examine results

In [63]:
# Load the pickled 10k-record frame.
# NOTE: unpickling can execute arbitrary code, so only load files produced by
# this project. The context manager closes the file handle (the original left
# it open), and forward slashes keep the relative path portable.
with open("../data/processed/10k_music_records.p", "rb") as pickle_file:
    oclc_music_df = pickle.load(pickle_file)

In [64]:
# Display the loaded frame (pandas truncates the middle rows in the rendered output).
oclc_music_df

Unnamed: 0_level_0,100_raw,245_raw,260_raw,100,245,260,weights,brief_bibs,worldcat_matches
001,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
5078115,"$aLillenas, Haldor.",$aGod save America.,$cc1945.,"Lillenas, Haldor.",God save America.,1945,0.014506,"{'numberOfRecords': 1, 'briefRecords': [{'oclc...",[]
5160161,"$aDandurand, J. L.",$aWhy didn't you leave me go :$bWhy are you so...,$cc1922.,"Dandurand, J. L.",Why didn't you leave me go,1922,0.021366,"{'numberOfRecords': 2, 'briefRecords': [{'oclc...",[]
5124972,"$aDelettre, Jean.","$aParle O moi d'autre chose, There's no more y...",$cc1934.,"Delettre, Jean.","Parle O moi d'autre chose, There's no more you...",1934,0.018400,{'numberOfRecords': 0},[]
5196595,"$aSquires, Harry D.",$aI don't want to be left all alone.,$cc1953.,"Squires, Harry D.",I don't want to be left all alone.,1953,0.019387,"{'numberOfRecords': 1, 'briefRecords': [{'oclc...",[]
5195339,"$aCuriel, Federico,$d-1985.",$aHoy me quiero emborrachar :$bCancion Ranchera.,$cc1950.,"Curiel, Federico",Hoy me quiero emborrachar,1950,0.015361,"{'numberOfRecords': 1, 'briefRecords': [{'oclc...",[]
...,...,...,...,...,...,...,...,...,...
5224059,"$aCharleu, Lulu.",$aParis-Morvan /$cLulu Charleu ; Guy Bertret.,$cc1964.,"Charleu, Lulu.",Paris-Morvan,1964,0.020662,"{'numberOfRecords': 2, 'briefRecords': [{'oclc...",[]
5175069,"$aNacho, Tata,$d1894-1968.",$aCapullito de Rosa.,$cc1964.,"Nacho, Tata",Capullito de Rosa.,1964,0.020662,"{'numberOfRecords': 1, 'briefRecords': [{'oclc...",[]
5064159,"$aDominguez, Armando.",$aDestino :$bCancion - Bolero.,$cc1945.,"Dominguez, Armando.",Destino,1945,0.014506,"{'numberOfRecords': 1, 'briefRecords': [{'oclc...",[]
5067035,"$aRoma, Caro,$d1866-1937.",$aAngel-cake.,$cc1928.,"Roma, Caro",Angel-cake.,1928,0.020818,"{'numberOfRecords': 1, 'briefRecords': [{'oclc...",[]


In [69]:
# check no errors during API calls
def find_err(cell):
    """Return True when ``cell`` is a string, False otherwise.

    Used to flag "brief_bibs" cells that hold a string rather than a parsed
    response dict (presumably an error message stored during the API calls;
    confirm against the code that filled the column).

    Parameters
    ----------
    cell : object
        A single cell value.

    Returns
    -------
    bool
    """
    # isinstance also recognises str subclasses, which `type(cell) == str` misses.
    return isinstance(cell, str)

# Show the rows whose "brief_bibs" cell is flagged by find_err; an empty result
# means no cell was flagged.
oclc_music_df.loc[oclc_music_df["brief_bibs"].apply(find_err)]

Unnamed: 0_level_0,100_raw,245_raw,260_raw,100,245,260,weights,brief_bibs,worldcat_matches
001,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


In [89]:
def remove_ukmgb(brief_bib):
    """Drop the brief records whose cataloging agency is 'UKMGB'.

    Parameters
    ----------
    brief_bib : dict or None
        Response dict with a 'numberOfRecords' count and, when that count is
        non-zero, a 'briefRecords' list whose items carry
        ['catalogingInfo']['catalogingAgency'].

    Returns
    -------
    dict or None
        ``brief_bib`` itself (unchanged, same object) when it is falsy or
        reports zero records; otherwise a new dict holding only the
        'numberOfRecords' and 'briefRecords' keys for the records kept.
    """
    # Nothing to filter: hand the input back untouched.
    if not brief_bib or brief_bib['numberOfRecords'] == 0:
        return brief_bib

    kept_records = []
    for record in brief_bib['briefRecords']:
        if record['catalogingInfo']['catalogingAgency'] != 'UKMGB':
            kept_records.append(record)

    return {
        'numberOfRecords': len(kept_records),
        'briefRecords': kept_records,
    }

def find_cat_agency(brief_bib):
    """List the cataloging agency of every brief record in ``brief_bib``.

    Parameters
    ----------
    brief_bib : dict or None
        Response dict with a 'numberOfRecords' count and, when that count is
        non-zero, a 'briefRecords' list whose items carry
        ['catalogingInfo']['catalogingAgency'].

    Returns
    -------
    list
        One agency code per brief record, in record order; an empty list when
        ``brief_bib`` is falsy or reports zero records.
    """
    agencies = []
    if brief_bib and brief_bib['numberOfRecords'] != 0:
        for record in brief_bib['briefRecords']:
            agencies.append(record['catalogingInfo']['catalogingAgency'])
    return agencies

In [95]:
# Per row, the list of cataloging agencies left once UKMGB records are removed.
# Derived straight from "brief_bibs" so this cell does not depend on the
# "brief_bibs_no_ukmgb" column, which is only created by a cell further down
# and is therefore missing when the notebook is run top to bottom.
cat_agencies = oclc_music_df["brief_bibs"].apply(remove_ukmgb).apply(find_cat_agency)

In [96]:
# Twenty most frequent cataloging agencies across all rows.
# explode() flattens the per-row lists in a single pass (empty lists become NaN,
# which value_counts() ignores). Summing the Series instead would concatenate
# the lists one by one, copying the growing list each time.
cat_agencies.explode().value_counts().head(20)

INT      1601
AU@       758
BDF       313
UBY       304
ERD       265
DKDLA     194
MCBBM     166
JRK       136
YOU       128
S3O       120
AZS       104
BGU       104
EYM        88
CLE        85
DRB        83
S#L        80
CGP        71
IYU        68
NZ1        67
NLC        65
Name: count, dtype: int64

In [97]:
# Preview the UKMGB-filtered responses. Computed from "brief_bibs" on the spot
# because the "brief_bibs_no_ukmgb" column is only created by a cell further
# down, so reading it here fails when the notebook is run top to bottom.
oclc_music_df["brief_bibs"].apply(remove_ukmgb).rename("brief_bibs_no_ukmgb")

001
5078115           {'numberOfRecords': 0, 'briefRecords': []}
5160161           {'numberOfRecords': 0, 'briefRecords': []}
5124972                               {'numberOfRecords': 0}
5196595           {'numberOfRecords': 0, 'briefRecords': []}
5195339           {'numberOfRecords': 0, 'briefRecords': []}
                                 ...                        
5224059    {'numberOfRecords': 1, 'briefRecords': [{'oclc...
5175069           {'numberOfRecords': 0, 'briefRecords': []}
5064159           {'numberOfRecords': 0, 'briefRecords': []}
5067035           {'numberOfRecords': 0, 'briefRecords': []}
5145985           {'numberOfRecords': 0, 'briefRecords': []}
Name: brief_bibs_no_ukmgb, Length: 10000, dtype: object

In [71]:
# Store, per row, the response with all UKMGB-cataloged brief records removed.
oclc_music_df["brief_bibs_no_ukmgb"] = oclc_music_df["brief_bibs"].apply(remove_ukmgb)

In [81]:
# Number of brief records reported by each original response.
oclc_music_df["n_records"] = oclc_music_df["brief_bibs"].map(
    lambda brief_bib: brief_bib["numberOfRecords"]
)

In [82]:
# Number of brief records left in each response after the UKMGB filter.
oclc_music_df["n_records_no_ukmgb"] = oclc_music_df["brief_bibs_no_ukmgb"].map(
    lambda brief_bib: brief_bib["numberOfRecords"]
)

In [85]:
# Summary statistics restricted to rows with at least one brief record.
oclc_music_df.loc[oclc_music_df["n_records"] != 0].describe()

Unnamed: 0,weights,n_records,n_records_no_ukmgb
count,8827.0,8827.0,8827.0
mean,0.018993,1.866093,0.77025
std,0.003683,2.95425,2.46673
min,0.000469,1.0,0.0
25%,0.017696,1.0,0.0
50%,0.019387,1.0,0.0
75%,0.021366,2.0,0.0
max,0.025196,91.0,49.0


In [84]:
# Summary statistics restricted to rows that keep at least one brief record
# after the UKMGB filter.
oclc_music_df.loc[oclc_music_df["n_records_no_ukmgb"] != 0].describe()

Unnamed: 0,weights,n_records,n_records_no_ukmgb
count,2132.0,2132.0,2132.0
mean,0.018643,4.38227,3.189024
std,0.003472,5.197979,4.181411
min,0.001085,1.0,1.0
25%,0.016387,2.0,1.0
50%,0.019162,3.0,2.0
75%,0.020638,5.0,4.0
max,0.025196,91.0,49.0


In [102]:
# Total number of brief records left across all rows after the UKMGB filter.
oclc_music_df["n_records_no_ukmgb"].sum()

6799

In [100]:
# Inspect the filtered response of one record (index 5093571), looked up among
# the rows that keep at least one brief record after the UKMGB filter.
oclc_music_df.loc[
    oclc_music_df["n_records_no_ukmgb"] != 0, "brief_bibs_no_ukmgb"
].loc[5093571]

{'numberOfRecords': 3,
 'briefRecords': [{'oclcNumber': '52497021',
   'title': 'In bluebird land',
   'creator': 'Albert E. Short',
   'date': '©1921',
   'machineReadableDate': '1921',
   'language': 'eng',
   'generalFormat': 'MsScr',
   'publisher': 'Will Rossiter',
   'publicationPlace': 'Chicago',
   'mergedOclcNumbers': ['861297690'],
   'catalogingInfo': {'catalogingAgency': 'MFM',
    'catalogingLanguage': 'eng',
    'levelOfCataloging': ' ',
    'transcribingAgency': 'MFM'}},
  {'oclcNumber': '801653635',
   'title': 'In bluebird land : song',
   'creator': 'Albert E. Short',
   'date': '1921',
   'machineReadableDate': '1921',
   'language': 'eng',
   'generalFormat': 'MsScr',
   'publisher': 'Will Rossiter',
   'publicationPlace': 'Chicago, ILL',
   'catalogingInfo': {'catalogingAgency': 'YOU',
    'catalogingLanguage': 'eng',
    'levelOfCataloging': 'M',
    'transcribingAgency': 'YOU'}},
  {'oclcNumber': '43030468',
   'title': 'In bluebird land : fox trot',
   'creator'