## ContentDM ARK Assignment
scripted by Ryan Wolfslayer, Iowa State University

In [2]:
import arks_code.download_cdm as cdm 
import arks_code.transformations as trans
import ast
from batch_download import batch_download
import glob 
import gzip
import os
import os.path
import pandas as pd
import shutil
import subprocess

from lxml import etree
from lxml.etree import parse


Step 1 (Optional): Download XML from CDM

In [None]:
# OPTIONAL STEP
# Export every collection returned by cdm.cdm_collections() as XML into
# ./cdm_xml, then rename each exported file after a path segment taken from
# the first rdf:Description/@about value it contains.

TARGETPATH = os.path.join(os.getcwd(), 'cdm_xml')
RDF_NAMESPACES = {'rdf': "http://www.w3.org/1999/02/22-rdf-syntax-ns#"}

ff = cdm.DownloadXML()
ff.setup(TARGETPATH, driver_path='arks_code/chromedriver')

# Close the browser even when an export fails part-way through, so a
# failed run does not leave a stray browser window behind.
try:
    for col in cdm.cdm_collections():
        ff.select_and_export(col)
finally:
    ff.driver.close()

# rename files
for xml_path in glob.glob(os.path.join(TARGETPATH, '*.xml')):
    root = etree.parse(xml_path).xpath('//rdf:Description/@about',
                                       namespaces=RDF_NAMESPACES)
    if not root:
        # Nothing to derive a name from; leave the file as it is.
        print('Skipping {}: no rdf:Description/@about found'.format(xml_path))
        continue
    # The new file name is the third-from-last "/"-separated segment of the
    # first @about value (presumably the collection alias -- confirm against
    # an exported file).
    shutil.move(xml_path, os.path.join(TARGETPATH, str(root[0].split('/')[-3]) + '.xml'))


Step 2: Select Collection and convert to dataframe

In [8]:
# CHANGE collection_filename to the file you want to use
collection_filename = 'cdm_xml/ada_hayden.xml'

# Run the collection XML through trans.dctohtml() and let pandas read the
# first table out of the resulting markup.
# If you get an encoding error, try experimenting with the pandas encoding
html_table = str(trans.dctohtml()(parse(collection_filename)))
df = pd.read_html(html_table, encoding='ASCII')[0]

# The first row holds the column names. Take an explicit copy of the
# remaining rows so the column assignments below modify a real dataframe
# rather than a slice (avoids pandas' SettingWithCopyWarning).
df.columns = df.iloc[0]
df = df.iloc[1:].copy()
df['publisher'] = 'Iowa State University Library'

# Dropping nan values on type works in most collections
# df = df.dropna(subset=['type'], axis=0)
df['date'] = df['date'].apply(trans.remove_nan)

# Values are cast to str before clean-up, so a missing title/creator
# becomes the literal string 'nan'.
for text_column in ('title', 'creator'):
    df[text_column] = df[text_column].apply(
        lambda value: trans.remove_xml_encoding(str(value)))


Step 3: Confirm collection metadata is accurate, and make corrections. 
Note that this step is subject to change.

In [9]:
# count is incorrect, removing undated rows
# Keep only the rows whose date is something other than the literal 'undated',
# then display the resulting (rows, columns) shape.
is_dated = df['date'].ne('undated')
df = df.loc[is_dated]
df.shape

(193, 6)

# ENTER LOGIN for EZID

At this point we are ready to upload. Before continuing, make sure to check the shoulder.

In [10]:
# ENTER LOGIN for EZID
import subprocess

# Both values are blank placeholders; fill them in before running the
# cells that talk to EZID.
username, password = '', ''

# REPLACE with target
shoulder = 'ark:/replace/this'

Step 4: Register each item with EZID to mint its ARK

In [11]:
# REGISTER EZID; this will submit to EZID
# Requires ezid.py be stored in same directory.
#
# For every row of df, write the row's metadata to metadata.txt and pass
# that file to ezid.py's "mint" operation under the configured shoulder.
# metadata.txt is overwritten on each iteration.
for index, row in df.iterrows():
    with open('metadata.txt', 'w') as f:
        f.write('erc.who:{}\n'.format(row['creator']))
        f.write('erc.what:{}\n'.format(row['title']))
        f.write('erc.when:{}\n'.format(row['date']))
        print(row['title'])
        # The dc.creator / dc.date / dc.type lines are left out when the
        # value is missing (None, or anything whose str() is 'nan').
        if row['creator'] is not None and str(row['creator']) != 'nan':
            f.write('dc.creator:{}\n'.format(row['creator']))
        f.write('dc.title:{}\n'.format(row['title']))
        f.write('dc.publisher:{}\n'.format(row['publisher']))
        if row['date'] is not None and str(row['date']) != 'nan':
            f.write('dc.date:{}\n'.format(row['date']))
        if row['type'] is not None and str(row['type']) != 'nan':
            f.write('dc.type:{}\n'.format(row['type']))
        f.write('_target:{}\n'.format(row['url']))
        f.write('_profile:dc')

    # Pass the argument list directly instead of through a shell: with
    # shell=True a list is only handled as intended on Windows (on POSIX
    # just "python" would be run), and the credentials would be exposed to
    # shell interpretation.
    exit_code = subprocess.call(
        ["python", "ezid.py", "{}:{}".format(username, password),
         "mint", shoulder, "@", "metadata.txt"])
    if exit_code != 0:
        print('ezid.py exited with status {} for {}'.format(exit_code, row['title']))

Erigeron (Fleabane)
Erigeron (Fleabane)
Hieracium canadense (Canada hawkweed)
Rudbeckia hirta (Black-eyed Susan)
Monarda fistulosa (Wild bergamot) specimen
Vernonia fasciculata (Ironweed)
Echinocystis lobata (Wild balsam apple) specimen
Eripatorium serolinum (Late-flowering thoroughwort)
Silphium perfoliatum (Indian cup) specimen
Echinocystis lobata (Wild balsam apple)
Aster novae-angliae (New England aster)
Bidens frondosa (Beggar-ticks; Boot-jacks)
Trifolium repens (White clover)
Sicyos angulateo (One-seeded bur cucumber)
Helianthus sp. (Sunflower)
Eupatorium perfoliatum (Boneset)
Liatris scariosa (Blazing star)
Bidens cernua (Stick-tight, smaller bur marigolds)
Bidens cernua (Stick-tight, smaller bur marigold)
Bidens frondoasa and cernua (Leafy-bracted tickseed)
Lycium barbarum L. (Matrimony vine)
Amorpha canescens (Lead plant)
Echinacea purpurea (Coneflower) specimen
Cucumis melo (Muskmelon)
Prenanthes (Rattlesnake root)
Aster multiflorus (Dense flowered aster) specimen
Prairie nea

Letter from Ada Hayden to Elizabeth, Sept. 16, 1909
Letter from Louis Hermann Pammel to Ada Hayden, Sept. 28, 1909
Letter from Ada Hayden to Louis Hermann Pammel, September 26, 1909
Letter from Louis Hermann Pammel to Ada Hayden, March 14, 1917
Letter from Louis Hermann Pammel to Ada Hayden, March 21, 1910
Letter from Ada Hayden to Louis Hermann Pammel, April 5, 1910
Letter from Ada Hayden to Louis Hermann Pammel, May 9, 1910
Letter from Louis Hermann Pammel to Ada Hayden, July 7, 1915
Letter from Louis Hermann Pammel to Ada Hayden, April 11, 1910
Letter from Louis Hermann Pammel to D. M. Hayden, January 29, 1910
Letter from Louis Hermann Pammel to Ada Hayden, May 16, 1917
Letter from Ada Hayden to Louis Hermann Pammel, November 15, 1916
Letter from Louis Hermann Pammel to D. M. Hayden, November 9, 1909
Letter from Louis Hermann Pammel to Ada Hayden, May 2, 1917
Letter from Louis Hermann Pammel to Ada Hayden, November 6, 1909
Letter from Louis Hermann Pammel to Ada Hayden, June 8, 1910

Step 5: Download the registered ARKs from EZID as XML and unzip them into ezid_xml

In [12]:
# Request an XML batch download of the account's ARKs, then decompress the
# downloaded .xml.gz from the current directory into ezid_xml/.
batch_download(username, password, ["format=xml", "type=ark"])

# MAKE SURE there are no gzipped files in the ark directory
# prior to running batch_download or this might grab the wrong
# file.
gzipped_candidates = [x for x in os.listdir(".") if x.endswith(".xml.gz")]
if not gzipped_candidates:
    # Fail with a clear message instead of a bare StopIteration.
    raise FileNotFoundError("no .xml.gz file found in the current directory; "
                            "did batch_download succeed?")
if len(gzipped_candidates) > 1:
    print("warning: several .xml.gz files found, using {}".format(gzipped_candidates[0]))
gzipped_file = gzipped_candidates[0]

output_dir = "ezid_xml"
# Create the output directory on first use so the open() below cannot fail
# just because the folder is missing.
os.makedirs(output_dir, exist_ok=True)
# Same file name without the trailing ".gz".
target_xml = os.path.join(output_dir, gzipped_file[:-3])

with gzip.open(gzipped_file, "rt", encoding="utf-8") as ifh, \
        open(target_xml, "w", encoding="utf-8") as ofh:
    ofh.write(ifh.read())

# Once we've unzipped the file into the ezid_xml directory,
# we can delete the downloaded file.
os.remove(gzipped_file)

submitting download request...
error: method not allowed
 request failed


SystemExit: 1

In [56]:
# Collection identifier including the trailing slash, e.g. p16001coll47/
collection_number = "p16001coll16/"

## CDM Upload Process

Step 6: Identify target URLs and format as a dictionary

In [57]:
# Transform the unzipped EZID XML with trans.formatupload for the chosen
# collection_number, then turn the transformed tree into a Python dict.
# In the recorded run the dict maps ContentDM reference URLs to ARKs.
upload_transform = trans.formatupload(collection_number)
root = upload_transform(parse(target_xml))

dict_transform = trans.formatxmltodict()
dict_literal = str(dict_transform(root))
# literal_eval only accepts Python literals, so the transform output is
# parsed as data rather than executed.
my_dict = ast.literal_eval(dict_literal)

Step 7: Make sure the URLs are correct

In [19]:
# [x.text for x in root.xpath('record/element[@name="_target"]')]
# Show the number of entries in my_dict, then the mapping itself.
print(len(my_dict), my_dict, sep="\n")

289
{'http://cdm16001.contentdm.oclc.org/cdm/ref/collection/p16001coll16/id/458': 'ark:/87292/w90056', 'http://cdm16001.contentdm.oclc.org/cdm/ref/collection/p16001coll16/id/26': 'ark:/87292/w90323', 'http://cdm16001.contentdm.oclc.org/cdm/ref/collection/p16001coll16/id/398': 'ark:/87292/w9033g', 'http://cdm16001.contentdm.oclc.org/cdm/ref/collection/p16001coll16/id/350': 'ark:/87292/w9091s', 'http://cdm16001.contentdm.oclc.org/cdm/ref/collection/p16001coll16/id/284': 'ark:/87292/w90d0f', 'http://cdm16001.contentdm.oclc.org/cdm/ref/collection/p16001coll16/id/175': 'ark:/87292/w90k86', 'http://cdm16001.contentdm.oclc.org/cdm/ref/collection/p16001coll16/id/80': 'ark:/87292/w90p7v', 'http://cdm16001.contentdm.oclc.org/cdm/ref/collection/p16001coll16/id/481': 'ark:/87292/w90s54', 'http://cdm16001.contentdm.oclc.org/cdm/ref/collection/p16001coll16/id/49': 'ark:/87292/w90w55', 'http://cdm16001.contentdm.oclc.org/cdm/ref/collection/p16001coll16/id/18': 'ark:/87292/w91044', 'http://cdm16001.co

## Batch Upload to CDM with Selenium
You may need to change the following:
* ContentDM server
* Driver path
* Sign in with ContentDM Credentials when prompted
* Collection name value

In [58]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
import time
from math import ceil

# Collect the href of the first link in every row of the paged
# "search results" table of the collection's edit view into urllist.

# Locator for the first link of each row in the "search results" table.
RESULT_LINKS_XPATH = '//table[@summary="search results"]/tbody/tr/td/span/a[1]'
# The paging arithmetic below assumes the table shows 50 rows per page.
ROWS_PER_PAGE = 50

# NOTE(review): passing the driver path positionally may not be accepted by
# every Selenium release -- confirm against the installed version.
driver = webdriver.Chrome(r'arks_code/chromedriver')
# Configure the implicit wait before the first element lookup so that every
# find_element call (including the first one) waits for the page to render.
driver.implicitly_wait(10)
driver.maximize_window()
driver.get('https://server16001.contentdm.oclc.org/cgi-bin/admin/start.exe')

collections_tab = driver.find_element(By.XPATH, "//a[@id='acolls' and @title='collections']")
collections_tab.click()

# Choose the collection; the option value is "/" plus the collection
# identifier without its trailing slash.
element = driver.find_element(By.XPATH, '//select[@name="CISODB"]')
sel = Select(element)
sel.select_by_value('/{}'.format(collection_number.replace('/', '')))

driver.find_element(By.XPATH, '//input[@type="submit" and @value="change"]').click()

items_tab = driver.find_element(By.XPATH, "//a[@id='aitems' and @title='items']")
items_tab.click()

edit_collection = driver.find_element(By.XPATH, '//a/b[text()="Edit"]')
edit_collection.click()
driver.find_element(By.XPATH, '//*[@id="AllFields"]/div/table/tbody/tr[1]/td[3]/span/a/b').click()

# Number read from the results page (presumably the total item count --
# confirm against the page).
num = int(driver.find_element(By.XPATH, '/html/body/table/tbody/tr[2]/td/table/tbody/tr/td/table[2]/tbody/tr[1]/td/span/b/span').text)
# Number of "Next page" clicks needed after the first page. Using
# num // 50 here would be one click too many whenever num is an exact
# multiple of the page size.
table_range = max(ceil(num / ROWS_PER_PAGE) - 1, 0)

# Table Data
urllist = [link.get_attribute("href")
           for link in driver.find_elements(By.XPATH, RESULT_LINKS_XPATH)]

for page in range(table_range):
    next_links = driver.find_elements(By.XPATH, '//a[@title="Next page"]')
    if not next_links:
        # No further page is offered; stop instead of raising IndexError.
        break
    next_links[0].click()
    urllist.extend(link.get_attribute("href")
                   for link in driver.find_elements(By.XPATH, RESULT_LINKS_XPATH))

Step 8: Make sure urllist (ignoring its "edittxt.exe" links) and my_dict contain the same number of entries

In [59]:
# The length of urllist is roughly (but not always exactly)
# double the length of my_dict: for each url that matches a url
# in my_dict there is often a second one containing "edittxt.exe".
# Leaving those out gives a better number for comparison.
non_edit_urls = [url for url in urllist if "edittxt.exe" not in url]
urllist_len, my_dict_len = len(non_edit_urls), len(my_dict)

# Prints, one per line: whether the counts match, then each count.
for value in (urllist_len == my_dict_len, urllist_len, my_dict_len):
    print(value)

False
289
20


Step 9: Upload ARKS to CDM

In [69]:
# iterate through every other url and update identifier AllFields
for item in urllist2:
    print(item)
    if 'cdm16001' in item:
            pass
    else:
        driver.get(item)
        # send Ark if title matches ?
        urlfind = driver.find_element(By.XPATH, '/html/body/table/tbody/tr[2]/td/table/tbody/tr/td/table[2]/tbody/tr[2]/td/table/tbody/tr[1]/td/span/a')
        z = urlfind.get_attribute('href')
        z = z.split('/')
        z[4] = 'ref'
        z = '/'.join(z)
        
        # pick only one
        #----------------------------------------------------------------------------------------
        identifier_field = driver.find_element(By.XPATH, '//input[@name="identi"]')
        # identifier_field = driver.find_element(By.XPATH, '//input[@name="identa"]')
        # identifier_field = driver.find_element(By.XPATH, '/html/body/table/tbody/tr[2]/td/table/tbody/tr/td/table[2]/tbody/tr[2]/td/table/tbody/tr[28]/td[2]/input')
        # identifier_field = driver.find_element(By.XPATH, '//input[@name="uid"]')
        
        
        #----------------------------------------------------------------------------------------
        identifier_field_value = identifier_field.get_attribute('value')
        if identifier_field_value == '':
            try:
                identifier_field.send_keys('{}{}'.format('https://n2t.net/',my_dict[(z[16:].split(',')[0].replace("'",''))]))
    
                # time.sleep(10)
                #--------------------------------------------------------------------
                driver.find_element(By.XPATH, '//input[@id="subbut"]').click()
                #--------------------------------------------------------------------
            except KeyError:
                print('Error on {}'.format(z))
        
        else:
            try:
                if "https://n2t.net/ark:/87292/" not in identifier_field_value:
                    identifier_field.send_keys('; {}{}'.format('https://n2t.net/',my_dict[(z[16:].split(',')[0].replace("'",''))]))
                
                if ";" in identifier_field_value:
                    set_key = '<br>'.join(set(identifier_field.get_attribute('value').strip().split('; ')))
                    identifier_field.clear()
                    identifier_field = driver.find_element(By.XPATH, '//input[@name="identi"]')
                    # identifier_field = driver.find_element(By.XPATH, '//input[@name="uid"]')
                    # identifier_field = driver.find_element(By.XPATH, '/html/body/table/tbody/tr[2]/td/table/tbody/tr/td/table[2]/tbody/tr[2]/td/table/tbody/tr[28]/td[2]/input')
                    identifier_field.send_keys(set_key)
                #--------------------------------------------------------------------
                driver.find_element(By.XPATH, '//input[@id="subbut"]').click()
                #--------------------------------------------------------------------
                # time.sleep(10)
            except KeyError:
                print('Error on {}'.format(z))

            

http://cdm16001.contentdm.oclc.org/cdm/singleitem/collection/p16001coll16/id/68
http://cdm16001.contentdm.oclc.org/cdm/singleitem/collection/p16001coll16/id/40
http://cdm16001.contentdm.oclc.org/cdm/singleitem/collection/p16001coll16/id/16
http://cdm16001.contentdm.oclc.org/cdm/singleitem/collection/p16001coll16/id/83
http://cdm16001.contentdm.oclc.org/cdm/singleitem/collection/p16001coll16/id/59
http://cdm16001.contentdm.oclc.org/cdm/singleitem/collection/p16001coll16/id/34
http://cdm16001.contentdm.oclc.org/cdm/singleitem/collection/p16001coll16/id/62
http://cdm16001.contentdm.oclc.org/cdm/singleitem/collection/p16001coll16/id/32
http://cdm16001.contentdm.oclc.org/cdm/singleitem/collection/p16001coll16/id/30
http://cdm16001.contentdm.oclc.org/cdm/singleitem/collection/p16001coll16/id/29
http://cdm16001.contentdm.oclc.org/cdm/singleitem/collection/p16001coll16/id/33
http://cdm16001.contentdm.oclc.org/cdm/singleitem/collection/p16001coll16/id/18
http://cdm16001.contentdm.oclc.org/cdm/s