# clean and prepare Library of Congress book data
- Data from the libary of congress: https://www.loc.gov/cds/products/MDSConnect-books_all.html

## setup

In [1]:
from pyspark.sql import SQLContext
#import pandas as pd
from pyspark.sql.functions import *
import pyspark.sql
import string
sqlC = SQLContext(sc)
print('done')

done


## convert data from xml to parquet

In [2]:
loc = sqlC.read.format("com.databricks.spark.xml"
                        ).option("rowTag", "record"
                                ).option("samplingRatio", 0.01
                                        ).load("/user/jwlock/LoC/Books*")
loc.count()

KeyboardInterrupt: 

In [None]:
loc.write.option("maxRecordsPerFile", 10000
                  ).parquet("loc_raw.parquet", 
                            mode='overwrite')
print("done!")

In [3]:
loc = sqlC.read.parquet('loc_raw.parquet')

In [4]:
loc.printSchema()

root
 |-- controlfield: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _tag: long (nullable = true)
 |-- datafield: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _ind1: string (nullable = true)
 |    |    |-- _ind2: string (nullable = true)
 |    |    |-- _tag: long (nullable = true)
 |    |    |-- subfield: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |    |-- _code: string (nullable = true)
 |-- leader: string (nullable = true)



## create an ID column for the books

In [5]:
loc = loc.withColumn("id", monotonically_increasing_id())

In [6]:
loc.show()

+--------------------+--------------------+--------------------+---+
|        controlfield|           datafield|              leader| id|
+--------------------+--------------------+--------------------+---+
|[[  2010657549,1]...|[[ , ,10,WrappedA...|01627cam a2200397...|  0|
|[[  2010657550,1]...|[[ , ,10,WrappedA...|01934cam a2200421...|  1|
|[[  2010657551,1]...|[[ , ,10,WrappedA...|01790cam a2200373...|  2|
|[[  2010657552,1]...|[[ , ,10,WrappedA...|01564cam a2200361...|  3|
|[[  2010657553,1]...|[[ , ,10,WrappedA...|01396cam a2200373...|  4|
|[[  2010657554,1]...|[[ , ,10,WrappedA...|01731cam a2200421...|  5|
|[[  2010657555,1]...|[[ , ,10,WrappedA...|02519cam a2200517...|  6|
|[[  2010657556,1]...|[[ , ,10,WrappedA...|06922cam a2201513...|  7|
|[[  2010657557,1]...|[[ , ,10,WrappedA...|01926cam a2200445...|  8|
|[[  2010657558,1]...|[[ , ,10,WrappedA...|01255cam a2200349...|  9|
|[[  2010657559,1]...|[[ , ,10,WrappedA...|01420cam a2200349...| 10|
|[[  2010657560,1]...|[[ , ,10,Wra

## Extract data from LoC tags
- LoC uses different `_tag` values to mean different information about records. This section takes that and saves it to variables with meaninfgul names
- The relevant tag documentation is included at the bottom of this notebook and can be found at the data link at the top as well. 

In [7]:
tmp = loc.select('id', explode('datafield').alias('d')).select('id', 'd.*')#.show()
tmp.show()

+---+-----+-----+----+--------------------+
| id|_ind1|_ind2|_tag|            subfield|
+---+-----+-----+----+--------------------+
|  0|     |     |  10|  [[  2010657549,a]]|
|  0|     |     |  35|[[(CStRLIN)VAUP90...|
|  0|     |     |  40|[[ViU,a], [ViU,c]...|
|  0|    0|     |  41|           [[ger,a]]|
|  0|    0|    0|  50|[[ML48,a], [[S290...|
|  0|     |     |  51|[[Microfilm Music...|
|  0|    0|    0| 245|[[Zur Heimkehr :,...|
|  0|     |     | 260|[[Berlin :,a], [[...|
|  0|     |     | 300|[[[2], 20, [6] p....|
|  0|     |     | 500|[[U.S. RISM Libre...|
|  0|     |     | 500|[[At head of titl...|
|  0|     |     | 500|[[Singers' names ...|
|  0|     |    0| 650|[[Dramatic music,...|
|  0|     |     | 653| [[Gerechtigkeit,a]]|
|  0|     |     | 653|        [[Friede,a]]|
|  0|     |     | 653|        [[Morgen,a]]|
|  0|     |     | 653|        [[Mittag,a]]|
|  0|     |     | 653|         [[Abend,a]]|
|  0|     |     | 653|  [[Jahreszeiten,a]]|
|  0|     |     | 653|       [[K

In [8]:
t2 = tmp.filter(col('_tag') == 245
                  ).select('id', explode('subfield').alias('s')
                          ).select('id', 's.*')

title = t2.filter(col('_code') == 'a').select('id', col('_value').alias('title'))
subtitle = t2.filter(col('_code') == 'b').select('id', col('_value').alias('subtitle'))

title.take(20)

[Row(id=0, title=u'Zur Heimkehr :'),
 Row(id=1, title=u'Ein Soldat :'),
 Row(id=2, title=u'Eine neue Station, oder, Der Herr Eisenbahn-Director :'),
 Row(id=3, title=u'Sein Schatten :'),
 Row(id=4, title=u'Judith :'),
 Row(id=5, title=u'Un curioso accidente :'),
 Row(id=6, title=u'Gian Maria Visconti, duca di Milano :'),
 Row(id=7, title=u'Ai\u0308da :'),
 Row(id=8, title=u"Cuore di madre, ossia, La contessa d'Altenberg :"),
 Row(id=9, title=u'Giuditta :'),
 Row(id=10, title=u'Ezechia :'),
 Row(id=11, title=u'Una burla :'),
 Row(id=12, title=u'Il conte di Monreal :'),
 Row(id=13, title=u"L'avvocato Patelin :"),
 Row(id=14, title=u"Il trionfo d'amore :"),
 Row(id=15, title=u'Il califfo :'),
 Row(id=16, title=u"Linda d'Ispahan :"),
 Row(id=17, title=u'Le astuzie femminili :'),
 Row(id=18, title=u'Il figliuol prodigo :'),
 Row(id=19, title=u'I due mariti :')]

In [9]:
t2 = tmp.filter(col('_tag') == 520
                  ).select('id', explode('subfield').alias('s')
                          ).select('id', 's.*')

summary = t2.filter(col('_code') == 'a').select('id', 
                                                col('_value').alias('summary'))
summary_expanded = t2.filter(col('_code') == 'b'
                            ).select('id', 
                                     col('_value').alias('summary_expanded'))

summary.show()

+----+--------------------+
|  id|             summary|
+----+--------------------+
|9585|Typescript includ...|
|9610|National Symphony...|
|9611|National Symphony...|
|9612|National Symphony...|
|9613|National Symphony...|
|9614|National Symphony...|
|9615|National Symphony...|
|9616|National Symphony...|
|9617|National Symphony...|
|9618|National Symphony...|
|9619|National Symphony...|
|9620|National Symphony...|
|9621|National Symphony...|
|9622|National Symphony...|
|9623|National Symphony...|
|9624|National Symphony...|
|9625|National Symphony...|
|9626|National Symphony...|
|9627|National Symphony...|
|9628|National Symphony...|
+----+--------------------+
only showing top 20 rows



In [10]:
t2 = tmp.filter(col('_tag') == 505
                  ).select('id', explode('subfield').alias('s')
                          ).select('id', 's.*')

contents = t2.filter(col('_code') == 'a').select('id', 
                                                col('_value').alias('contents'))

contents.take(5)

[Row(id=218, contents=u'Vorbemerkung, p. 3-4 -- Wolfgang Amadeus Mozart als Freimaurer, p. 5-10 -- Die Zauberflo\u0308te (essay), p. 11-29 -- Die Zauberflo\u0308te (libretto), p. 31-64.'),
 Row(id=1259, contents=u'I. Der Freischu\u0308tz. Volks-Oper, p. 1-62 -- II. scho\u0308pfungsgeschichte des Freischu\u0308tzen. Biographische Novelle, p. 63-138 -- III. Briefe, p. 139-176 -- IV. Gedichte, p. 177-210 -- V. Erla\u0308uterungen. (Aus Sprache und Geschichte), p. 211-242 -- VI. Miscellen, p. 243-272.'),
 Row(id=3160, contents=u'Dem Unendlichen p. 1-4 -- Vertrauen auf Gott, p. 5-11.'),
 Row(id=3963, contents=u'Erstes Bild. Die Arretirung -- Zweites Bild. Die tropische Taufe -- Drittes Bild. Die Favorit-Sultanin -- Viertes Bild. Der Kaiser von Japan.'),
 Row(id=5655, contents=u'1. Le sventure fortunate, p. 1-32 -- 2. La finta zingara, p. 33-71.')]

In [11]:
t2 = tmp.filter(col('_tag') == 260
                  ).select('id', explode('subfield').alias('s')
                          ).select('id', 's.*')

pubdate = t2.filter(col('_code') == 'c').select('id', 
                                                col('_value').alias('pubdate'))

pubdate.show()

+---+-------+
| id|pubdate|
+---+-------+
|  0|  1871.|
|  1|  1871.|
|  2|  1871.|
|  3|  1871.|
|  4|  1871.|
|  5|[1871?]|
|  6|  1871.|
|  7|  1871.|
|  8|  1871.|
|  9| [1871]|
| 10|  1871.|
| 11|  1871.|
| 12|[1871?]|
| 13|  1871.|
| 14|  1871.|
| 15|  1871.|
| 16|  1871.|
| 17|  1871.|
| 18|  1871.|
| 19|[1871?]|
+---+-------+
only showing top 20 rows



In [12]:
t2 = tmp.filter(col('_tag') == 100
                  ).select('id', explode('subfield').alias('s')
                          ).select('id', 's.*')

main_author = t2.filter(col('_code') == 'a').select('id', 
                                                col('_value').alias('main_author'))

main_author.show()

+---+--------------------+
| id|         main_author|
+---+--------------------+
|  1|       Bial, Rudolf,|
|  2|       Bial, Rudolf,|
|  3|Flotow, Friedrich...|
|  4|     Doppler, Franz,|
|  5| Ricci-Stolz, Luigi,|
|  6|      Vicini, Luigi,|
|  7|    Verdi, Giuseppe,|
|  8|Rossi, Giovanni G...|
|  9|   Righi, Telesforo,|
| 10|    Picchi, Ermanno,|
| 11| Parisini, Federico,|
| 12|Gandolfi, Riccard...|
| 13|  Montuoro, Achille.|
| 14| Matteini, Raffaele.|
| 15|  De Champs, Ettore,|
| 16|Malipiero, France...|
| 17| Cimarosa, Domenico,|
| 18|     Auber, D. F. E.|
| 19|  D'Arienzo, Nicola,|
| 20|    Angeloni, Carlo,|
+---+--------------------+
only showing top 20 rows



In [13]:
t2 = tmp.filter(col('_tag') == 700
                  ).select('id', explode('subfield').alias('s')
                          ).select('id', 's.*')

other_author = t2.filter(col('_code') == 'a').select('id', 
                                                col('_value').alias('other_author'))

other_author = other_author.groupby('id').agg(collect_list('other_author').alias('other_author'))

other_author.take(20)

[Row(id=9945, other_author=[u'Hart, Jillian.', u'Davidson, Carolyn,', u'Bridges, Kate,']),
 Row(id=10422, other_author=[u'Seitanidi, Maria May.', u'Crane, Andrew,']),
 Row(id=11190, other_author=[u'Kiwan, Dina,']),
 Row(id=11434, other_author=[u'Hinrichs-Rahlwes, Rainer.']),
 Row(id=11745, other_author=[u'Thompson, Andrew R.', u'Jenkinson, Elizabeth', u'Rumsey, Nichola.', u'Newell, Robert,']),
 Row(id=11945, other_author=[u'Eldridge, Elleanor,', u'Moody, Joycelyn,']),
 Row(id=12044, other_author=[u'Dunn, Jim.', u'Strong, Ron.']),
 Row(id=13098, other_author=[u'Miyazaki, Hayao,', u'Hubbert, Jim,']),
 Row(id=13638, other_author=[u'Smith, Robin James,', u'Hetherington, Kevin,']),
 Row(id=14846, other_author=[u'Stepan, Alfred C.']),
 Row(id=15057, other_author=[u'Malenczyk, Rita,']),
 Row(id=15322, other_author=[u'Bruce, Bertram C.,', u'Bishop, Ann P.,', u'Budhathoki, Nama R.,']),
 Row(id=15371, other_author=[u'Cohn, Stephen M.,', u'Dolich, Matthew,']),
 Row(id=17048, other_author=[u'Wiesn

##  Select ISBN and ISSN data

In [14]:
t2 = tmp.filter(col('_tag') == 20
                  ).select('id', explode('subfield').alias('s')
                          ).select('id', 's.*')

isbn = t2.filter(col('_code') == 'a').select('id', 
                                                col('_value').alias('isbn'))

#isbn = isbn.groupby('id').agg(concat_ws(', ', collect_list('isbn')).alias('ISBNs'))
#isbn = isbn.withColumn('isbn', regexp_replace('isbn', '\D', ''))
#isbn = isbn.withColumn('isbn', regexp_replace('isbn', '-', ''))
isbn = isbn.withColumn('isbn', regexp_replace('isbn', ':', ''))
isbn = isbn.withColumn('isbn', regexp_replace('isbn', '\(.+\)', ''))
isbn = isbn.withColumn('isbn', regexp_replace('isbn', ' ', ''))

isbn.sample(False, 0.01).take(30)

[Row(id=9758, isbn=u'1420104578'),
 Row(id=9843, isbn=u'9780756404932'),
 Row(id=9867, isbn=u'9780373655267'),
 Row(id=9978, isbn=u'9780373618279'),
 Row(id=9994, isbn=u'9780373694587'),
 Row(id=10055, isbn=u'9781624910128'),
 Row(id=10105, isbn=u'0071790233'),
 Row(id=10455, isbn=u'9780819816382'),
 Row(id=10484, isbn=u'9781611861167'),
 Row(id=10537, isbn=u'9781617039003'),
 Row(id=10635, isbn=u'9780345539786'),
 Row(id=10655, isbn=u'9780415837088'),
 Row(id=10694, isbn=u'9780802410795'),
 Row(id=10726, isbn=u'9780891123927'),
 Row(id=10744, isbn=u'9781284026900'),
 Row(id=10824, isbn=u'9780345535825'),
 Row(id=10868, isbn=u'9781466643338'),
 Row(id=10903, isbn=u'9780749469665'),
 Row(id=11004, isbn=u'9781618510495'),
 Row(id=11053, isbn=u'1421412381'),
 Row(id=11185, isbn=u'9780230361805'),
 Row(id=11273, isbn=u'9780062211316'),
 Row(id=11519, isbn=u'9781477762394'),
 Row(id=11590, isbn=u'9780544120044'),
 Row(id=11595, isbn=u'9781555717315'),
 Row(id=11604, isbn=u'9781137007155'),


In [18]:
t2 = tmp.filter(col('_tag') == 22
                  ).select('id', explode('subfield').alias('s')
                          ).select('id', 's.*')

issn = t2.filter(col('_code') == 'a').select('id', 
                                                col('_value').alias('issn'))

issn = issn.withColumn('issn', regexp_replace('issn', ':', ''))
issn = issn.withColumn('issn', regexp_replace('issn', '\(.+\)', ''))
issn = issn.withColumn('issn', regexp_replace('issn', ' ', ''))

issn.show(20)

+------------+-------------+
|          id|         issn|
+------------+-------------+
|  8589941913|    1059-1133|
| 34359748114|    2211-8101|
| 42949695364|    2192-855X|
| 51539640426|    0081-6914|
| 68719497479|   1449701130|
| 68719498105|9783790825930|
| 77309422617|    1559-5374|
| 85899370405|    1934-8525|
| 94489281980|    0169-9156|
| 94489283415|    0769-2633|
| 94489303015|     15709310|
| 94489309950|    1815-3712|
|103079215920|    0554-8128|
|103079215997|    0554-8128|
|103079223063|    1396-1810|
|103079245183|    0392-4866|
|103079249425|    1310-8247|
|103079249648|    1313-8138|
|111669184984|     16192435|
|120259086556|     18362060|
+------------+-------------+
only showing top 20 rows



In [19]:
t2 = tmp.filter(col('_tag') == 650
                  ).select('id', explode('subfield').alias('s')
                          ).select('id', 's.*')

subjects = t2.filter(col('_code') == 'a').select('id', 
                                                col('_value').alias('subject'))

#isbn = isbn.groupby('id').agg(concat_ws(', ', collect_list('isbn')).alias('ISBNs'))

subjects.show(20)

+---+--------------+
| id|       subject|
+---+--------------+
|  0|Dramatic music|
|  1|        Operas|
|  2|Dramatic music|
|  3|        Operas|
|  4|        Operas|
|  5|        Operas|
|  6|        Operas|
|  7|        Operas|
|  8|        Operas|
|  9|        Operas|
| 10|     Oratorios|
| 11|        Operas|
| 12|        Operas|
| 13|        Operas|
| 14|        Operas|
| 15|        Operas|
| 16|        Operas|
| 17|        Operas|
| 18|        Operas|
| 19|        Operas|
+---+--------------+
only showing top 20 rows



## Merge all results together

In [26]:
together = title.join(subtitle, on='id', how='outer')
together = together.join(isbn, on='id', how='outer')
together = together.join(issn, on='id', how='outer')
together = together.join(pubdate, on='id', how='outer')
together = together.join(summary, on='id', how='outer')
together = together.join(summary_expanded, on='id', how='outer')
together = together.join(subjects, on='id', how='outer')
together = together.join(contents, on='id', how='outer')
together = together.join(main_author, on='id', how='outer')
together = together.join(other_author, on='id', how='outer')

together.printSchema()

root
 |-- id: long (nullable = true)
 |-- title: string (nullable = true)
 |-- subtitle: string (nullable = true)
 |-- isbn: string (nullable = true)
 |-- issn: string (nullable = true)
 |-- pubdate: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- summary_expanded: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- contents: string (nullable = true)
 |-- main_author: string (nullable = true)
 |-- other_author: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [27]:
spaces = ""
for i in string.punctuation:
    spaces += " "
    
together = together.withColumn('loc_bare_text', 
                         translate(lower(concat_ws(" ", 
                                                   together.title, 
                                                   together.subtitle, 
                                                   together.contents,
                                                   together.summary,
                                                   together.summary_expanded, 
                                                   together.subject,
                                                  )),
                                   string.punctuation, 
                                   spaces
                                  ))

## clean up text data 
- For ease of matching, we remove characters that are inconsistently used such as colons and periods in book titles. 

In [28]:
cols = ['title', 'subtitle', 'main_author', 'pubdate']

remove = '.:/'

for c in cols:
    together = together.withColumn(c, translate(c, remove, spaces))
    together = together.withColumn(c, regexp_replace(col(c), ',$', ''))
    together = together.withColumn(c, lower(trim(col(c))))
    
#together.sample(False, 0.01).select('title', 'subtitle', 'main_author', 'pubdate').take(10)

In [29]:
together.cache()

DataFrame[id: bigint, title: string, subtitle: string, isbn: string, issn: string, pubdate: string, summary: string, summary_expanded: string, subject: string, contents: string, main_author: string, other_author: array<string>, loc_bare_text: string]

In [30]:
#together.count()

## more sophisticated duplicate dropping, because some data are inconsistent

In [31]:
together = together.dropDuplicates(subset=['isbn', 'id', 'issn'])
together = together.dropDuplicates(subset=['isbn', 'main_author'])
together = together.dropDuplicates(subset=['isbn', 'title'])
#together.count()

## summarize

In [32]:
together.groupby('isbn').count().sort(desc('count')).take(20)

[Row(isbn=None, count=1444620),
 Row(isbn=u'8570602332', count=66),
 Row(isbn=u'0815199031', count=62),
 Row(isbn=u'9789989550256', count=59),
 Row(isbn=u'9989550255', count=54),
 Row(isbn=u'7222038752', count=51),
 Row(isbn=u'0534274943', count=48),
 Row(isbn=u'7543451301', count=47),
 Row(isbn=u'9025606385', count=45),
 Row(isbn=u'9787811185133', count=44),
 Row(isbn=u'781118513X', count=43),
 Row(isbn=u'0815199155', count=42),
 Row(isbn=u'2252016906', count=42),
 Row(isbn=u'7805233578', count=39),
 Row(isbn=u'8930080014', count=38),
 Row(isbn=u'081519918X', count=36),
 Row(isbn=u'0415203929', count=35),
 Row(isbn=u'8970131450', count=34),
 Row(isbn=u'7805233586', count=32),
 Row(isbn=u'8476790228', count=32)]

In [34]:
together.groupby('issn').count().sort(desc('count')).take(20)

[Row(issn=None, count=8359301),
 Row(issn=u'1062-4007', count=112),
 Row(issn=u'1099-7326', count=37),
 Row(issn=u'1443-4911', count=10),
 Row(issn=u'1530-1028', count=9),
 Row(issn=u'0187-425X', count=8),
 Row(issn=u'1531-1627', count=7),
 Row(issn=u'9788886509855', count=6),
 Row(issn=u'9788822230980', count=6),
 Row(issn=u'0272-9172', count=5),
 Row(issn=u'9788831194358', count=5),
 Row(issn=u'1326-6004', count=5),
 Row(issn=u'2211-3061', count=4),
 Row(issn=u'1087-4852', count=4),
 Row(issn=u'2211-8101', count=4),
 Row(issn=u'2213-5421', count=4),
 Row(issn=u'1396-1810', count=4),
 Row(issn=u'1874-0294', count=4),
 Row(issn=u'1059-1133', count=4),
 Row(issn=u'0340-1022', count=4)]

## save

In [33]:
together.write.option("maxRecordsPerFile", 10000
                  ).parquet("loc_books.parquet", 
                            mode='overwrite')
print("done!")

done!


## LoC tag documentation below

245 a is title
245 b is subtitle

650 are subjects
300 is physical description
050 LoC call number
260 publisher / edition info
040 catalogue source
020 ISBNs
022 ISSNs
043 geography
035 sys control num
042 auth code
082 dewey decimal
500 general note
504 bibliographic note

260 - PUBLICATION, DISTRIBUTION, ETC. (IMPRINT) (R)
   Indicators
      First - Sequence of publishing statements
         # - Not applicable/No information provided/Earliest available publisher
         2 - Intervening publisher
         3 - Current/latest publisher
      Second - Undefined
         # - Undefined
      First - Presence of publisher in imprint (BK MP MU SE) [OBSOLETE]
         0 - Publisher, distributor, etc. is present
         1 - Publisher, distributor, etc. is not present
      Second - Added entry/publisher relationship (SE) [OBSOLETE]
         0 - Publisher, distributor, etc. not same as issuing body in added entry
         1 - Publisher, distributor, etc. same as issuing body in added entry
   Subfield Codes
      $a - Place of publication, distribution, etc. (R)
      $b - Name of publisher, distributor, etc. (R)
      $c - Date of publication, distribution, etc. (R)
      $d - Plate or publisher's number for music (Pre-AACR 2) (NR) [LOCAL]
      $e - Place of manufacture (R)
      $f - Manufacturer (R)
      $g - Date of manufacture (R)
      $3 - Materials specified (NR)
      $6 - Linkage (NR)
      $8 - Field link and sequence number (R)

520 - SUMMARY, ETC. (R)
   Indicators
      First - Display constant controller
         # - Summary
         0 - Subject
         1 - Review
         2 - Scope and content
         3 - Abstract
         4 - Content advice
         8 - No display constant generated
      Second - Undefined
         # - Undefined
   Subfield Codes
      $a - Summary, etc. note (NR)
      $b - Expansion of summary note (NR)
      $c - Assigning agency (NR)
      $u - Uniform Resource Identifier (R)
      $z - Source of note information (NR) [OBSOLETE]
      $2 - Source (NR)
      $3 - Materials specified (NR)
      $6 - Linkage (NR)
      $8 - Field link and sequence number (R)
      
505 - FORMATTED CONTENTS NOTE (R)
   Indicators
      First - Display constant controller
         0 - Contents
         1 - Incomplete contents
         2 - Partial contents
         8 - No display constant generated
      Second - Level of content designation
         # - Basic
         0 - Enhanced
   Subfield Codes
      $a - Formatted contents note (NR)
      $g - Miscellaneous information (R)
      $r - Statement of responsibility (R)
      $t - Title (R)
      $u - Uniform Resource Identifier (R) 
      $6 - Linkage (NR)
      $8 - Field link and sequence number (R)

100 - MAIN ENTRY--PERSONAL NAME (NR)
   Indicators
      First - Type of personal name entry element
         0 - Forename
         1 - Surname
         2 - Multiple surname [OBSOLETE]
         3 - Family name
      Second - Undefined
         # - Undefined
      Second - Main entry/subject relationship (BK MU SE) [OBSOLETE]
   Subfield Codes
      $a - Personal name (NR)
      $b - Numeration (NR)
      $c - Titles and other words associated with a name (R)
      $d - Dates associated with a name (NR)
      $e - Relator term (R)
      $f - Date of a work (NR)
      $g - Miscellaneous information (R)
      $j - Attribution qualifier (R)
      $k - Form subheading (R)
      $l - Language of a work (NR)
      $n - Number of part/section of a work (R)
      $p - Name of part/section of a work (R)
      $q - Fuller form of name (NR)
      $t - Title of a work (NR)
      $u - Affiliation (NR)
      $0 - Authority record control number or standard number (R)
      $1 - Real World Object URI (R)
      $4 - Relationship (R)
      $6 - Linkage (NR)
      $8 - Field link and sequence number (R)
      
700 - ADDED ENTRY--PERSONAL NAME (R)
   Indicators
      First - Type of personal name entry element
         0 - Forename
         1 - Surname
         2 - Multiple surname [OBSOLETE]
         3 - Family name
      Second - Type of added entry
         # - No information provided
         0 - Alternative entry (BK CF MP MU SE MX) [OBSOLETE]
         1 - Secondary entry (BK CF MP MU SE MX) [OBSOLETE]
         1 - Printed on card (VM) [OBSOLETE]
         2 - Analytical entry
         3 - Not printed on card (VM) [OBSOLETE]
   Subfield Codes
      $a - Personal name (NR)
      $b - Numeration (NR)
      $c - Titles and other words associated with a name (R)
      $d - Dates associated with a name (NR)
      $e - Relator term (R)
      $f - Date of a work (NR)
      $g - Miscellaneous information (R)
      $h - Medium (NR)
      $i - Relationship information (R)
      $j - Attribution qualifier (R) 
      $k - Form subheading (R)
      $l - Language of a work (NR)
      $m - Medium of performance for music (R)
      $n - Number of part/section of a work (R)
      $o - Arranged statement for music (NR)
      $p - Name of part/section of a work (R)
      $q - Fuller form of name (NR)
      $r - Key for music (NR)
      $s - Version (R)
      $t - Title of a work (NR)
      $u - Affiliation (NR)
      $x - International Standard Serial Number (NR)
      $0 - Authority record control number or standard number (R)
      $1 - Real World Object URI (R)
      $3 - Materials specified (NR)
      $4 - Relationship (R)
      $5 - Institution to which field applies (NR)
      $6 - Linkage (NR)
      $8 - Field link and sequence number (R)