## ContentDM ARK Assignment
Created by Ryan Wolfslayer, 2018, Iowa State University.

Maintained by Wesley Teal, 2019, Iowa State University.

In [None]:
import ast
import glob 
import gzip
from io import StringIO
import os
import os.path
import pandas as pd
import shutil
import subprocess

from lxml import etree
from lxml.etree import parse

import arks_code.download_cdm as cdm 
import arks_code.transformations as trans
from batch_download import batch_download

# Directory holding the CONTENTdm XML exports: a "cdm_xml" folder under the
# current working directory. Steps 1 and 2 read and write files here.
TARGETPATH = os.path.join(os.getcwd(),'cdm_xml')

# CONFIGURATION VARIABLES ===============================
# Change the following variables to suit your needs. All four are empty
# placeholders and must be filled in before running the steps that use them.

# Enter login information for EZID (used when minting ARKs in Step 4 and
# when calling batch_download in Step 5).
ezid_username = ""
ezid_password = ""

# Enter the ARK shoulder you'll be using (passed to the ezid.py "mint" call).
ezid_shoulder = ""

# Declare the collection number to upload, e.g. p16001coll47. It is used as
# the XML file name in Step 2 and to select the collection in CONTENTdm.
collection_number = "" 



Step 1 (Optional): Download XML from CDM

In [None]:
# OPTIONAL STEP
# Run the project's DownloadXML helper against TARGETPATH, then close the
# browser window it opened. (What setup() downloads is defined in
# arks_code.download_cdm, not here.)

downloader = cdm.DownloadXML()
downloader.setup(TARGETPATH, driver_path='arks_code/chromedriver')

downloader.driver.close()

# Rename every export*.xml file after the third-from-last path segment of
# its first rdf:Description "about" URL (presumably the collection alias).
RDF_NAMESPACES = {'rdf': "http://www.w3.org/1999/02/22-rdf-syntax-ns#"}

for export_path in glob.glob(TARGETPATH+'/export*.xml'):
    about_urls = etree.parse(export_path).xpath('//rdf:Description/@about',
                                                namespaces=RDF_NAMESPACES)
    try:
        new_name = str(about_urls[0].split('/')[-3])+'.xml'
    except IndexError:
        # The file has no "about" URL, or the URL differs from the standard
        # layout, so no name can be derived. Leave this file untouched and
        # keep renaming the others.
        continue
    shutil.move(export_path, os.path.join(TARGETPATH, new_name))


Step 2: Select Collection and convert to dataframe

In [None]:
# !!! This section is currently unstable and may be subject to major changes.
# There seem to be new issues that crop up from time to time here that halt
# processing dead.

collection_filename = f"{TARGETPATH}/{collection_number}.xml"

with open(collection_filename, "r", encoding="utf-8") as fh:
    in_file = fh.read()

# Was getting a 404 error when attempting to resolve the Dublin Core namespace,
# so stripping out namespaces for now to get XPath queries to work. This is
# perhaps not ideal and may not be permanently needed. If this does become
# permanent, it should be moved into its own function.
in_file = in_file.replace('<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"\n', '<RDF>')
in_file = in_file.replace('         xmlns:dc="http://purl.org/dc/elements/1.1/">\n', '')
in_file = in_file.replace('dc:', '')
in_file = in_file.replace('rdf:', '')

tree = etree.parse(StringIO(in_file))

# Fields copied from each record into the dataframe, in column order.
METADATA_FIELDS = ("title", "date", "type", "creator")


def _first_text(record, tag):
    """Return the text of the first <tag> element under record, or None.

    None is returned both when the element is missing and when it is empty,
    so every record always yields exactly one value per field.
    """
    element = record.find(f".//{tag}")
    if element is None:
        return None
    return element.text


# Build one row per <Description about="..."> record. Collecting each field
# with a document-wide query (//title, //date, ...) produces lists of
# different lengths, or silently misaligned rows, as soon as one record
# lacks a field; reading the fields record by record keeps them together.
records = []
for record in tree.xpath("//Description[@about]"):
    row = {"url": record.get("about")}
    for field in METADATA_FIELDS:
        row[field] = _first_text(record, field)
    records.append(row)

df = pd.DataFrame(records, columns=["url", *METADATA_FIELDS])

# Drop the first record (presumably the collection-level entry -- confirm).
# .copy() makes the slice independent so adding a column below is safe.
df = df.iloc[1:].copy()
df['publisher'] = 'Iowa State University Library'

# We DO NOT want to create ARKs for each individual part of an object
# just the object as a whole, so we need to filter out items whose titles
# are Page #, p. #, Front, and Back, or those with no title.
# na=True makes records whose title is missing count as a match, so they are
# filtered out too instead of producing NaN and breaking the `~` below.
is_part_or_untitled = df.title.str.match(
    r"(^([Pp](age|\.) \d+|[Ff]ront|[Bb]ack)$|^$)", na=True)
df = df[~is_part_or_untitled]

Step 3: Confirm collection metadata is accurate, and make corrections. 
Note that this step is subject to change.

In [None]:
# count is incorrect, removing undated rows 
#df = df[df['date']!='undated']
# Print the shape explicitly: a bare `df.shape` that is not the last
# expression in the cell is evaluated and then discarded without display.
print(df.shape)
df

Step 4: Register ARKs with EZID

In [None]:
# REGISTER EZID; this will submit to EZID
# Requires ezid.py be stored in same directory.
#
# For every row of df this writes the row's metadata to metadata.txt and
# calls `ezid.py ... mint <shoulder> @ metadata.txt`, printing the title and
# the captured output of each call.
#
# NOTE(review): the EZID username and password are passed on the command
# line, where other local users may be able to see them in the process list.


def _has_value(value):
    """Return True unless value is None or renders as the string 'nan'."""
    return value is not None and str(value) != 'nan'


# Iterates over the full dataframe. To retry only some rows, iterate over a
# filtered dataframe here instead of an undefined ad-hoc variable.
for _, row in df.iterrows():
    # NOTE(review): the erc.* lines are written unconditionally, so a missing
    # creator or date is submitted as the literal text "None"/"nan". Kept
    # as-is; confirm whether EZID should receive those values.
    metadata_lines = [
        'erc.who:{}'.format(row['creator']),
        'erc.what:{}'.format(row['title']),
        'erc.when:{}'.format(row['date']),
    ]
    if _has_value(row['creator']):
        metadata_lines.append('dc.creator:{}'.format(row['creator']))
    metadata_lines.append('dc.title:{}'.format(row['title']))
    metadata_lines.append('dc.publisher:{}'.format(row['publisher']))
    if _has_value(row['date']):
        metadata_lines.append('dc.date:{}'.format(row['date']))
    if _has_value(row['type']):
        metadata_lines.append('dc.type:{}'.format(row['type']))
    metadata_lines.append('_target:{}'.format(row['url']))
    metadata_lines.append('_profile:dc')

    # One "key:value" pair per line, with no newline after the last line.
    with open("metadata.txt", "w", encoding="utf-8") as f:
        f.write('\n'.join(metadata_lines))

    # The arguments are given as a list, so no shell is used. With
    # shell=True on POSIX only the first list item ("python") would be run
    # as the command and ezid.py would never receive its arguments.
    result = subprocess.run(
        ["python", "ezid.py",
         "{}:{}".format(ezid_username, ezid_password),
         "mint", ezid_shoulder, "@", "metadata.txt"],
        capture_output=True,
        encoding="utf-8")
    print(row['title'])
    print(f"STDOUT: {result.stdout}\nSTDERR: {result.stderr}")
    

Step 5: Download the ARK records from EZID and unzip them

In [None]:
# Request a batch download of ARK records in XML format from EZID, then
# unzip the resulting .xml.gz file into the ezid_xml directory.
batch_download(ezid_username, ezid_password, ["format=xml", "type=ark"])

# MAKE SURE there are no gzipped files in the ark directory
# prior to running batch_download or this might grab the wrong
# file.
gzipped_file = next((x for x in os.listdir(".") if x.endswith(".xml.gz")), None)
if gzipped_file is None:
    # Fail with a readable message instead of a bare StopIteration.
    raise FileNotFoundError(
        "No .xml.gz file found in the current directory after batch_download.")

output_dir = "ezid_xml"
# Create the output directory on first use so the write below cannot fail
# just because the folder is missing.
os.makedirs(output_dir, exist_ok=True)
# Strip the trailing ".gz" to get the name of the unzipped file.
target_xml = os.path.join(output_dir, gzipped_file[:-3])

# Stream the decompressed text across instead of loading it all into memory.
with gzip.open(gzipped_file, "rt", encoding="utf-8") as ifh, \
        open(target_xml, "w", encoding="utf-8") as ofh:
    shutil.copyfileobj(ifh, ofh)

# Once we've unzipped the file into the ezid_xml directory,
# we can delete the downloaded file.
os.remove(gzipped_file)

## CONTENTdm Upload Process

Step 6: Identify target URLs and format as a dictionary

In [None]:
# Run the unzipped EZID XML through the project's "formatupload"
# transformation for this collection.
upload_transform = trans.formatupload(collection_number)
ezid_tree = parse(target_xml)
root = upload_transform(ezid_tree)

# The "formatxmltodict" transformation yields something whose string form is
# a Python literal; literal_eval turns that text into the actual dictionary.
dict_literal = str(trans.formatxmltodict()(root))
my_dict = ast.literal_eval(dict_literal)

Step 7: Make sure the URLs are correct

In [None]:
# [x.text for x in root.xpath('record/element[@name="_target"]')]
# Show the number of entries first, then the whole dictionary for a visual check.
entry_count = len(my_dict)
print(entry_count)
print(my_dict)

## Batch Upload to CONTENTdm with Selenium
You may need to change the following:
* CONTENTdm server
* Driver path
* Sign in with CONTENTdm credentials when prompted
* Collection name value

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
import time
from math import ceil

# Number of rows the item list shows per page.
ITEMS_PER_PAGE = 50
# Links in the "search results" table; one or more per listed item.
RESULT_LINKS_XPATH = '//table[@summary="search results"]/tbody/tr/td/span/a[1]'

# Selenium 4 takes the driver location through a Service object; passing the
# path as the first positional argument is no longer accepted by current
# releases.
driver = webdriver.Chrome(service=Service(r'arks_code/chromedriver'))
# Set the implicit wait before the first lookup so every find_element call,
# including the one right after the initial page load, waits for its element.
driver.implicitly_wait(10)
driver.get('https://server16001.contentdm.oclc.org/cgi-bin/admin/start.exe')

collections_tab = driver.find_element(By.XPATH, "//a[@id='acolls' and @title='collections']")
collections_tab.click()

driver.maximize_window()

# Pick the configured collection in the CISODB drop-down and confirm.
element = driver.find_element(By.XPATH, '//select[@name="CISODB"]')
sel = Select(element)
sel.select_by_value('/{}'.format(collection_number.replace('/','')))

driver.find_element(By.XPATH, '//input[@type="submit" and @value="change"]').click()

items_tab = driver.find_element(By.XPATH, "//a[@id='aitems' and @title='items']")
items_tab.click()

edit_collection = driver.find_element(By.XPATH, '//a/b[text()="Edit"]')
edit_collection.click()
driver.find_element(By.XPATH, '//*[@id="AllFields"]/div/table/tbody/tr[1]/td[3]/span/a/b').click()

# Number shown above the results table, treated as the total item count.
num = int(driver.find_element(By.XPATH, '/html/body/table/tbody/tr[2]/td/table/tbody/tr/td/table[2]/tbody/tr[1]/td/span/b/span' ).text)

# The first page is already displayed, so "Next page" has to be clicked once
# for each additional page. Using num // 50 clicked once too often whenever
# the count was an exact multiple of the page size, failing on the last page.
extra_pages = max(ceil(num / ITEMS_PER_PAGE) - 1, 0)

# Table Data
urllist = [link.get_attribute("href")
           for link in driver.find_elements(By.XPATH, RESULT_LINKS_XPATH)]

for _ in range(extra_pages):
    driver.find_elements(By.XPATH, '//a[@title="Next page"]')[0].click()
    urllist.extend(link.get_attribute("href")
                   for link in driver.find_elements(By.XPATH, RESULT_LINKS_XPATH))

Step 8: Make sure the number of URLs in urllist (excluding "edittxt.exe" URLs) matches the length of my_dict

In [None]:
# urllist usually holds about twice as many entries as my_dict (though not
# always exactly twice): besides the URL that corresponds to a my_dict
# entry, an item often has a second URL containing "edittxt.exe". Counting
# only the URLs without "edittxt.exe" gives a better number for comparison.
urllist_len = sum(1 for url in urllist if "edittxt.exe" not in url)
my_dict_len = len(my_dict)

# True when both counts agree, followed by the two counts themselves.
print(urllist_len == my_dict_len)
print(urllist_len)
print(my_dict_len)
#print(urllist)

Step 9: Upload ARKs to CONTENTdm

In [None]:
# Visit every collected edit URL and add the item's ARK to its identifier
# field, skipping URLs that contain 'cdm16001'.

# Prefix put in front of the ARK looked up in my_dict.
ARK_RESOLVER = 'https://n2t.net/'
# An identifier value containing this text is treated as already having an ARK.
ARK_PREFIX = "https://n2t.net/ark:/87292/"

# pick only one
#----------------------------------------------------------------------------------------
# The identifier input differs between collections. Choose it once here so
# that the initial lookup and the re-lookup after clearing the field always
# target the same input (previously the second lookup was hard-coded to
# "identi" while the first used "uid").
#IDENTIFIER_FIELD_XPATH = '//input[@name="id"]'
#IDENTIFIER_FIELD_XPATH = '//input[@name="identi"]'
#IDENTIFIER_FIELD_XPATH = '//input[@name="identa"]'
#IDENTIFIER_FIELD_XPATH = '/html/body/table/tbody/tr[2]/td/table/tbody/tr/td/table[2]/tbody/tr[2]/td/table/tbody/tr[28]/td[2]/input'
IDENTIFIER_FIELD_XPATH = '//input[@name="uid"]'
#----------------------------------------------------------------------------------------

REFERENCE_LINK_XPATH = '/html/body/table/tbody/tr[2]/td/table/tbody/tr/td/table[2]/tbody/tr[2]/td/table/tbody/tr[1]/td/span/a'
SUBMIT_BUTTON_XPATH = '//input[@id="subbut"]'

for item in urllist:
    print(item)
    if 'cdm16001' in item:
        continue

    driver.get(item)

    # Rewrite the link found on the edit page so that its fifth "/"-separated
    # part is 'ref'.
    urlfind = driver.find_element(By.XPATH, REFERENCE_LINK_XPATH)
    url_parts = urlfind.get_attribute('href').split('/')
    url_parts[4] = 'ref'
    z = '/'.join(url_parts)
    # Key used to look the item up in my_dict: the rewritten URL without its
    # first 16 characters, cut at the first comma, with single quotes removed.
    # (Presumably this mirrors how the keys of my_dict were built -- confirm.)
    ark_key = z[16:].split(',')[0].replace("'", '')

    identifier_field = driver.find_element(By.XPATH, IDENTIFIER_FIELD_XPATH)
    identifier_field_value = identifier_field.get_attribute('value')

    # Add the ARK unless the field already holds one. Only the dictionary
    # lookup is guarded: an item without an ARK is reported and left
    # unsubmitted, exactly as before.
    if ARK_PREFIX not in identifier_field_value:
        try:
            ark_url = '{}{}'.format(ARK_RESOLVER, my_dict[ark_key])
        except KeyError:
            print('Error on {}'.format(z))
            continue
        # An empty field receives just the ARK; otherwise append it after "; ".
        separator = '' if identifier_field_value == '' else '; '
        identifier_field.send_keys(separator + ark_url)

    # If the field held several "; "-separated values before this visit, drop
    # duplicates and rewrite it. dict.fromkeys keeps the first occurrence of
    # each value in its original position, whereas a set would reorder them.
    if ";" in identifier_field_value:
        current_values = identifier_field.get_attribute('value').strip().split('; ')
        set_key = '<br>'.join(dict.fromkeys(current_values))
        identifier_field.clear()
        identifier_field = driver.find_element(By.XPATH, IDENTIFIER_FIELD_XPATH)
        identifier_field.send_keys(set_key)

    # time.sleep(10)
    #--------------------------------------------------------------------
    driver.find_element(By.XPATH, SUBMIT_BUTTON_XPATH).click()
    #--------------------------------------------------------------------

In [None]:
from arks_code import report

# NOTE(review): this export file name is hard-coded; it will need to be
# changed to the file produced by the most recent EZID download.
EZID_EXPORT_PATH = "ezid_xml/d96217d52b.xml"
REPORT_PATH = "arks_report.tsv"

# Pass the EZID export to the project's report helper and write the result
# it returns to a TSV file.
info = report.generate_tsv(EZID_EXPORT_PATH)
report.write_tsv(info, REPORT_PATH)