# Dataset Extraction Example

This notebook shows how to extract a dataset from a digital collection described using MARCXML files.
This notebook uses the descriptive metadata from the Moving Image Archive catalogue, which is Scotland’s national collection of moving images.

### Setting up things

In [75]:
# import the libraries we need
# https://pypi.org/project/pymarc/
import pymarc, regex, csv
from pymarc import parse_xml_to_array    

### Reading original files

In [76]:
def _first_subfield(record, tag, code):
    """Return the first `code` subfield of the first `tag` field, or ''.

    Uses get_fields()/get_subfields() rather than record[tag][code] so that
    a missing field or subfield yields an empty string instead of None or
    an exception.
    """
    fields = record.get_fields(tag)
    if not fields:
        return ''
    values = fields[0].get_subfields(code)
    return (values[0] or '') if values else ''


# Parse every MARC record up front; the context manager closes the XML file.
# Binary mode lets the XML parser honour the encoding declared in the file.
with open('Moving-Image-Archive/Moving-Image-Archive-dataset-MARC.xml', 'rb') as marc_file:
    records = parse_xml_to_array(marc_file)

# newline='' is what the csv module expects for files it writes; the context
# manager guarantees the rows are flushed and the file is closed.
with open('marc_records.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_out = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_out.writerow(['title', 'author', 'place_production', 'date', 'extents',
                      'credits_note', 'subjects', 'summary', 'publisher'])

    for record in records:
        # title: 245 $a, followed by 245 $b when present
        title_parts = [_first_subfield(record, '245', 'a'),
                       _first_subfield(record, '245', 'b')]
        title = ' '.join(part for part in title_parts if part)

        # determine author: take $a from the first of these fields that exists
        author = ''
        for tag in ('100', '110', '700', '710'):
            if record.get_fields(tag):
                author = _first_subfield(record, tag, 'a')
                break

        # place_production and date both come from field 264
        place_production = _first_subfield(record, '264', 'a')
        date = _first_subfield(record, '264', 'c')

        # Physical Description - extent
        extents = _first_subfield(record, '300', 'a')

        # Creation/production credits note
        credits_note = _first_subfield(record, '508', 'a')

        # Summary
        summary = _first_subfield(record, '520', 'a')

        # subject: first $a of every 653 field, joined with ' -- '
        subject_terms = []
        for field in record.get_fields('653'):
            terms = field.get_subfields('a')
            if terms:  # skip 653 fields that carry no $a
                subject_terms.append(terms[0])
        subjects = ' -- '.join(subject_terms)

        # publisher
        publisher = _first_subfield(record, '260', 'b')

        csv_out.writerow([title, author, place_production, date, extents,
                          credits_note, subjects, summary, publisher])

## Reading CSV 