## ContentDM ARK Assignment
scripted by Ryan Wolfslayer, Iowa State University

In [None]:
import arks_code.download_cdm as cdm 
import arks_code.transformations as trans
import ast
import glob 
import os 
import pandas as pd
import shutil
import subprocess

from lxml import etree
from lxml.etree import parse


Step 1: Download XML from CDM

In [None]:
# OPTIONAL STEP
# Run the arks_code exporter over every collection returned by
# cdm.cdm_collections(), writing into ./cdm_xml, then rename each exported
# XML file after a segment of its first rdf:Description/@about value.

TARGETPATH = os.path.join(os.getcwd(), 'cdm_xml')
RDF_NAMESPACES = {'rdf': "http://www.w3.org/1999/02/22-rdf-syntax-ns#"}

ff = cdm.DownloadXML()
ff.setup(TARGETPATH, driver_path='arks_code/chromedriver')

try:
    # A plain loop: the exports are run purely for their side effects.
    for col in cdm.cdm_collections():
        ff.select_and_export(col)
finally:
    # Close the browser window even when an export fails part-way through.
    ff.driver.close()

# rename files
for xml_path in glob.glob(os.path.join(TARGETPATH, '*.xml')):
    about_values = etree.parse(xml_path).xpath('//rdf:Description/@about',
                                               namespaces=RDF_NAMESPACES)
    if not about_values:
        # Nothing to derive a name from; leave the file untouched.
        print('Skipping {}: no rdf:Description/@about found'.format(xml_path))
        continue
    # The new name is the third-from-last "/"-separated segment of the first
    # @about value (presumably the collection alias -- TODO confirm).
    new_name = str(about_values[0].split('/')[-3]) + '.xml'
    shutil.move(xml_path, os.path.join(TARGETPATH, new_name))


Step 2: Select Collection and convert to dataframe

In [None]:
collection_filename = 'cdm_xml/isu_historical_maps.xml'

# Transform the collection XML with trans.dctohtml() and let pandas parse the
# first table of the result.
# If you get an encoding error, try experimenting with the pandas encoding
html_table = str(trans.dctohtml()(parse(collection_filename)))
df = pd.read_html(html_table, encoding='ASCII')[0]

# The first parsed row is promoted to the header and then dropped from the
# data.  .copy() makes the result an independent frame, so the column
# assignments below do not write into a slice of the parsed table (which is
# what triggers pandas' SettingWithCopyWarning).
df.columns = df.iloc[0]
df = df.iloc[1:].copy()
df['publisher'] = 'Iowa State University Library'

# Dropping nan values on type works in most collections
# df = df.dropna(subset=['type'], axis=0)
df['date'] = df['date'].apply(trans.remove_nan)

# Each value is cast to str before cleaning, so a missing value reaches the
# helper as the text 'nan'.
for column in ('title', 'creator'):
    df[column] = df[column].apply(lambda x: trans.remove_xml_encoding(str(x)))


Step 3: Confirm collection metadata is accurate, and make corrections
Note that data cleaning may change depending on the collection.

In [None]:
# The count is incorrect with undated rows included, so keep only the rows
# whose date is something other than 'undated', then show the new shape.
has_date = df['date'].ne('undated')
df = df[has_date]
df.shape

In [None]:
# ENTER LOGIN for EZID
import subprocess

# Fill in both values before running the registration cell; they are passed
# to ezid.py as "username:password".
username, password = '', ''

# ENTER LOGIN for EZID

At this point we are ready for upload. Make sure to check the shoulder before continuing.
This part also requires python2.

Step 4:

In [None]:
# REGISTER EZID; this will submit to EZID
# Requires ezid.py be stored in same directory; make sure you are using python2
# Change the ark in the subprocess line to reflect desired shoulder


def _has_value(value):
    """Return True unless *value* is None or renders as the text 'nan'."""
    return value is not None and str(value) != 'nan'


for index, row in df.iterrows():
    # metadata.txt is rewritten for every row and handed to ezid.py below.
    with open('metadata.txt', 'w') as f:
        # NOTE(review): the erc.* lines are written unconditionally, so a
        # missing creator/date is emitted as-is -- confirm this is intended.
        f.write('erc.who:{}\n'.format(row['creator']))
        f.write('erc.what:{}\n'.format(row['title']))
        f.write('erc.when:{}\n'.format(row['date']))
        print(row['title'])
        if _has_value(row['creator']):
            f.write('dc.creator:{}\n'.format(row['creator']))
        f.write('dc.title:{}\n'.format(row['title']))
        f.write('dc.publisher:{}\n'.format(row['publisher']))
        if _has_value(row['date']):
            f.write('dc.date:{}\n'.format(row['date']))
        if _has_value(row['type']):
            f.write('dc.type:{}\n'.format(row['type']))
        f.write('_target:{}\n'.format(row['url']))
        f.write('_profile:dc')

    # Pass the argument list without shell=True: on POSIX a list combined
    # with shell=True runs only "python" and drops the remaining arguments,
    # and the shell would also interpret special characters in the password.
    status = subprocess.call(["python", "ezid.py",
                              "{}:{}".format(username, password),
                              "mint", "", "@", "metadata.txt"])
    if status != 0:
        # Surface failures instead of silently moving on to the next row.
        print('ezid.py exited with status {} for {}'.format(status, row['title']))

Step 5:

## Run Shell Script
The EZID shell script might only run in a Linux environment. Locate batch-download.sh and run the following command in a Linux shell:

* ./batch-download.sh iastate_lib PASSWORD format=xml type=ark

This command will download an xml.gz file; make sure you have gunzip

% gunzip -d [file].xml.gz

The resulting xml file will be used in the next step

## CDM Upload Process

Step 6: Identify target URLs and format as a dictionary

In [None]:
# declare collection number to upload
# keep the trailing slash to avoid overlap (presumably so one collection id
# cannot match the start of another -- TODO confirm)

#'p16001coll47/'

collection_number = 'p16001coll47/'
target_xml = 'ezid_xml/03a533575b.xml'

# Run the parsed EZID export through the upload transform for this
# collection, then through the dict-formatting transform.
ezid_tree = parse(target_xml)
root = trans.formatupload(collection_number)(ezid_tree)

# `unicode` exists only on Python 2.  The encoded transform output is parsed
# as a Python literal and stored in my_dict.
dict_source = unicode(trans.formatxmltodict()(root)).encode('utf-8')
my_dict = ast.literal_eval(dict_source)

Step 7: Make sure urls are correct

In [None]:
# [x.text for x in root.xpath('record/element[@name="_target"]')]
# Show the number of entries first, then the mapping itself, for a visual check.
for shown in (len(my_dict), my_dict):
    print(shown)

## Batch Upload to CDM with Selenium
You will need to change the following:
* ContentDM server
* Driver path
* Sign in with ContentDM Credentials when prompted
* Collection name value

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
import time

# Locator for the links in the item results table; used on every page.
RESULT_LINKS_XPATH = '//table[@summary="search results"]/tbody/tr/td/span/a[1]'

driver = webdriver.Chrome(r'D:/Users/rwolfsla/Desktop/software/chromedriver')
# Set the implicit wait before the first lookup so that every find_element
# call below polls for up to 10 seconds.
driver.implicitly_wait(10)
driver.get('https://server16001.contentdm.oclc.org/cgi-bin/admin/start.exe')

collections_tab = driver.find_element(By.XPATH, "//a[@id='acolls' and @title='collections']")
collections_tab.click()

driver.maximize_window()

# Select the collection by value: the collection number with its slash moved
# to the front (e.g. 'p16001coll47/' -> '/p16001coll47').
element = driver.find_element(By.XPATH, '//select[@name="CISODB"]')
sel = Select(element)
sel.select_by_value('/{}'.format(collection_number.replace('/','')))

driver.find_element(By.XPATH, '//input[@type="submit" and @value="change"]').click()

items_tab = driver.find_element(By.XPATH, "//a[@id='aitems' and @title='items']")
items_tab.click()

edit_collection = driver.find_element(By.XPATH, '//a/b[text()="Edit"]')
edit_collection.click()
driver.find_element(By.XPATH, '//*[@id="AllFields"]/div/table/tbody/tr[1]/td[3]/span/a/b').click()

num = driver.find_element(By.XPATH, '/html/body/table/tbody/tr[2]/td/table/tbody/tr/td/table[2]/tbody/tr[1]/td/span/b/span' ).text
# Floor division keeps the result an int on both Python 2 and Python 3, so it
# can be passed to range().  50 is presumably the page size -- TODO confirm.
table_range = int(num) // 50

# Table Data
urllist = [link.get_attribute("href")
           for link in driver.find_elements(By.XPATH, RESULT_LINKS_XPATH)]

for _ in range(table_range):
    next_links = driver.find_elements(By.XPATH, '//a[@title="Next page"]')
    if not next_links:
        # No further page to visit (e.g. when the count is an exact multiple
        # of the page size); stop instead of failing on next_links[0].
        break
    next_links[0].click()
    urllist.extend(link.get_attribute("href")
                   for link in driver.find_elements(By.XPATH, RESULT_LINKS_XPATH))

Step 8: Make sure that half the length of urllist equals the length of my_dict

In [None]:
# Compare half the number of collected links with the number of entries in
# my_dict, then print both figures for inspection.
half_url_count = len(urllist)/2
ark_count = len(my_dict)
print(half_url_count == ark_count)
print(half_url_count)
print(ark_count)

Step 9: Upload ARKS to CDM

In [None]:
# iterate through every other url and update identifier AllFields

ARK_RESOLVER = 'https://n2t.net/'

# pick only one
#----------------------------------------------------------------------------------------
IDENTIFIER_XPATH = '//input[@name="identi"]'
# IDENTIFIER_XPATH = '//input[@name="identa"]'
# IDENTIFIER_XPATH = '/html/body/table/tbody/tr[2]/td/table/tbody/tr/td/table[2]/tbody/tr[2]/td/table/tbody/tr[28]/td[2]/input'
# IDENTIFIER_XPATH = '//input[@name="uid"]'
#----------------------------------------------------------------------------------------


def _unique_in_order(values):
    """Return *values* without duplicates, keeping first-seen order."""
    seen = set()
    ordered = []
    for value in values:
        if value not in seen:
            seen.add(value)
            ordered.append(value)
    return ordered


for item in urllist:
    print(item)
    if 'cdm16001' in item:
        # Links containing 'cdm16001' are not edit pages; skip them.
        continue

    driver.get(item)
    # send Ark if title matches ?
    urlfind = driver.find_element(By.XPATH, '/html/body/table/tbody/tr[2]/td/table/tbody/tr/td/table[2]/tbody/tr[2]/td/table/tbody/tr[1]/td/span/a')
    # Rewrite the fifth "/"-separated segment of the link to 'ref'.
    z = urlfind.get_attribute('href').split('/')
    z[4] = 'ref'
    z = '/'.join(z)

    identifier_field = driver.find_element(By.XPATH, IDENTIFIER_XPATH)
    current_value = identifier_field.get_attribute('value')

    # The lookup key is the rewritten link minus its first 16 characters, cut
    # at the first comma and stripped of single quotes.  Only this lookup can
    # raise KeyError, so it is the only statement guarded.
    try:
        ark = my_dict[(z[16:].split(',')[0].replace("'",''))]
    except KeyError:
        print('Error on {}'.format(z))
        continue
    ark_url = '{}{}'.format(ARK_RESOLVER, ark)

    if current_value == '':
        identifier_field.send_keys(ark_url)
    else:
        # Append the ARK, read the combined value back, drop repeated
        # identifiers while keeping their order, and retype the field.
        identifier_field.send_keys('; {}'.format(ark_url))
        combined = identifier_field.get_attribute('value').strip().split('; ')
        set_key = '<br>'.join(_unique_in_order(combined))
        identifier_field.clear()
        identifier_field = driver.find_element(By.XPATH, IDENTIFIER_XPATH)
        identifier_field.send_keys(set_key)

    # time.sleep(10)
    #--------------------------------------------------------------------
    driver.find_element(By.XPATH, '//input[@id="subbut"]').click()
    #--------------------------------------------------------------------

            