In [None]:
from datetime import datetime
from xml.sax.saxutils import escape

import numpy as np
import pandas as pd
import requests

In [None]:
### Pull the existing titles on Direct and check which have been deposited with Crossref ###

In [None]:
# Read the tab-separated, header-less export of live titles.
df = pd.read_csv(
    'ZiplineLive_titleid_eisbn_doi_sclanding_20191108 2.csv',
    sep='\t',
    header=None,
)

In [None]:
# Label the four positional columns of the header-less export.
df.columns = [
    'Title_ID',
    'ISBN',
    'DOI',
    'URL',
]

In [None]:
# Ask the Crossref REST API how many works are registered under each DOI and
# record that count in the 'Deposited' column (0 means nothing is registered).

# Create the column up front so it exists even if every request fails; rows
# whose lookup fails keep NaN and are therefore not counted as "not deposited".
if 'Deposited' not in df.columns:
    df['Deposited'] = np.nan

for index, row in df.iterrows():
    print("{}/{}".format(index, len(df)))
    try:
        # Pass the filter through `params` so requests URL-encodes the DOI,
        # and set a timeout so one stalled connection cannot hang the loop.
        r = requests.get(
            url="https://api.crossref.org/works",
            params={'filter': 'doi:{}'.format(row.DOI)},
            timeout=30,
        )
        print(r.url)
        # Treat HTTP error statuses as failures instead of parsing their body.
        r.raise_for_status()
        data = r.json()
        df.loc[index, 'Deposited'] = data['message']['total-results']
    except (requests.RequestException, ValueError, KeyError) as e:
        # Best effort: report the failure and move on to the next title.
        print(e)
        


In [None]:
# Group the titles by the Crossref result count and tally each group.
g = df.groupby('Deposited')
g['Deposited'].value_counts()

# Keep only the titles for which Crossref reported zero results and save them.
is_not_deposited = df['Deposited'].eq(0.0)
not_deposited = df[is_not_deposited]
not_deposited.to_csv('not_deposited.csv')

In [None]:
# Show the last 50 titles that are missing from Crossref.
not_deposited.iloc[-50:]

In [None]:
### Generate Crossref XML ###

# Values needed for deposit:
    # IDTitle
    # auth_1
    # auth_type_1
    # auth_2
    # auth_type_2
    # auth_3
    # auth_type_3
    # auth_4
    # auth_type_4
    # auth_5
    # auth_type_5
    # Title
    # Subtitle
    # ISBN
    # Pub_Date
    # DOI




In [None]:
# Load the header-less book export and label its sixteen columns.
import_file = "allbooks_export.csv"
import_df = pd.read_csv(import_file, header=None, encoding='latin-1')
import_df.columns = [
    'Title',
    'auth_1', 'auth_type_1',
    'auth_2', 'auth_type_2',
    'auth_3', 'auth_type_3',
    'auth_4', 'auth_type_4',
    'auth_5', 'auth_type_5',
    'Subtitle', 'Title_ID', 'ISBN', 'Pub_Date', 'DOI',
]
# Discard this export's DOI and ISBN columns.
# NOTE(review): presumably so the later merge takes them from the live-titles
# frame without duplicate-column suffixes - confirm.
import_df = import_df.drop(['DOI', 'ISBN'], axis=1)


In [None]:
# Preview the freshly loaded export. The merged frame is only built in the
# next cell, so displaying it here would raise NameError on a fresh kernel.
import_df.head()

In [None]:
# Join the book export to the not-yet-deposited titles on their shared ID,
# then parse the month/day/year publication dates into datetimes.
df_merge = import_df.merge(not_deposited, on='Title_ID')
df_merge['Pub_Date'] = pd.to_datetime(
    df_merge['Pub_Date'],
    format='%m/%d/%Y',
)

In [None]:
# Helper used to add a column holding just the year taken from Pub_Date.
def get_year(row):
    """Return the ``year`` attribute of the row's ``Pub_Date`` value."""
    pub_date = row['Pub_Date']
    return pub_date.year

# Pub_Date is already a datetime column at this point, so take the year with
# the vectorised .dt accessor rather than a row-by-row apply. get_year stays
# available for single-row use.
df_merge['Year'] = df_merge['Pub_Date'].dt.year

In [None]:
# Display the merged frame to inspect the parsed Pub_Date and the new Year column.
df_merge

In [None]:
# Maps the export's contributor-type codes to contributor_role values.
# NOTE(review): this name shadows the builtin `dict` for the rest of the
# notebook. createXML looks the mapping up under this exact name, so any
# rename has to be made in both places at once.
dict = {'AU': 'author', 'ED': 'editor', 'TR': 'translator'}

In [None]:
# Current date and time, used for both the batch id and the <timestamp> value.
now = datetime.now()
# Year-first ordering makes the digits increase monotonically over time.
# Crossref uses <timestamp> as a version number for the deposited record, so a
# day-first string (e.g. 01122019...) can compare lower than an earlier
# deposit (31012019...).
dt_string = now.strftime("%Y%m%d%H%M%S")

In [None]:
# Opening of the deposit document: XML declaration, <doi_batch> root, the
# <head> block and the opening <body> tag. The two placeholders receive the
# batch id and the timestamp, in that order.
_header_template = """<?xml version="1.0" encoding="UTF-8"?>
<doi_batch xmlns="http://www.crossref.org/schema/4.3.7" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" version="4.3.7" xsi:schemaLocation="http://www.crossref.org/schema/4.3.7 http://www.crossref.org/schemas/crossref4.3.7.xsd">
    <head>
        <doi_batch_id>{}</doi_batch_id>
        <timestamp>{}</timestamp>
        <depositor>
            <depositor_name>The MIT Press</depositor_name>
            <email_address>kmcdouga@mit.edu</email_address>
        </depositor>
        <registrant>The MIT Press</registrant>
    </head>
    <body>"""
_batch_id = "Batch_" + dt_string
xml_header = _header_template.format(_batch_id, dt_string)



In [None]:
# Crossref contributor_role for each contributor-type code found in the export.
_CONTRIBUTOR_ROLES = {'AU': 'author', 'ED': 'editor', 'TR': 'translator'}

# Number of auth_N / auth_type_N column pairs carried by each row.
_MAX_CONTRIBUTORS = 5


def _xml_text(value):
    """Return ``value`` as a string with ``&``, ``<`` and ``>`` escaped for XML text."""
    return escape(str(value))


def _contributor_xml(name, role, sequence):
    """Build the XML element for a single contributor.

    A name written as "Surname, Given" becomes a <person_name>; a name with no
    comma is emitted as an <organization>.
    """
    fields = name.split(",")
    if len(fields) < 2:
        return ('<organization sequence="{}" contributor_role="{}">{}'
                '</organization>').format(sequence, role, _xml_text(name.strip()))
    return ('<person_name sequence="{}" contributor_role="{}">'
            '<given_name>{}</given_name>'
            '<surname>{}</surname>'
            '</person_name>').format(sequence, role,
                                     _xml_text(fields[1].strip()),
                                     _xml_text(fields[0].strip()))


def createXML(row):
    """Render one title row as a Crossref <book> XML fragment.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``auth_1``..``auth_5``, ``auth_type_1``..``auth_type_5``,
        ``Title``, ``Subtitle``, ``Year``, ``ISBN``, ``DOI`` and ``URL``.

    Returns
    -------
    str
        The <book> element, with its child elements joined by newlines.

    Notes
    -----
    * Only contributors whose type is AU, ED or TR and whose name is a
      non-blank string are emitted; the first one gets ``sequence="first"``
      and the rest ``sequence="additional"``.
    * <contributors> is written only when there is at least one contributor,
      so the opening and closing tags always match.
    * All text values are XML-escaped.
    """
    parts = ['<book book_type="monograph"><book_metadata language="en">']

    contributors = []
    for position in range(1, _MAX_CONTRIBUTORS + 1):
        role = _CONTRIBUTOR_ROLES.get(row['auth_type_{}'.format(position)])
        name = row['auth_{}'.format(position)]
        # Empty CSV cells arrive as float NaN; skip those and unknown type codes.
        if role is None or not isinstance(name, str) or not name.strip():
            continue
        sequence = "additional" if contributors else "first"
        contributors.append(_contributor_xml(name, role, sequence))

    if contributors:
        parts.append('<contributors>')
        parts.extend(contributors)
        parts.append('</contributors>')

    title = _xml_text(row["Title"])
    subtitle = row["Subtitle"]
    if isinstance(subtitle, str) and subtitle.strip():
        parts.append('<titles><title>{0}</title><subtitle>{1}</subtitle></titles>'.format(
            title, _xml_text(subtitle)))
    else:
        # A float here is the NaN of an empty cell; anything else is unexpected,
        # so warn but still emit the title rather than dropping <titles>.
        if not isinstance(subtitle, (str, float)):
            print("unrecognized type in \"subtitle\" field")
        parts.append('<titles><title>{0}</title></titles>'.format(title))

    parts.append('<publication_date><year>{0}</year></publication_date>'.format(row["Year"]))
    parts.append('<isbn>{0}</isbn>'.format(_xml_text(row["ISBN"])))
    parts.append('<publisher><publisher_name>The MIT Press</publisher_name></publisher>')
    parts.append('<doi_data><doi>{}</doi><resource>{}</resource></doi_data>'.format(
        _xml_text(row["DOI"]), _xml_text(row["URL"])))
    parts.append('</book_metadata></book>')
    return '\n'.join(parts)


In [None]:

# Write the deposit file. The encoding is set explicitly so the bytes on disk
# match the UTF-8 declaration in the XML header on every platform.
with open('out.xml', 'w', encoding='utf-8') as f:
    print(xml_header, file=f)
    # Build the fragments row by row so an empty df_merge yields an empty body.
    book_fragments = [createXML(row) for _, row in df_merge.iterrows()]
    print('\n'.join(book_fragments), file=f)
    print('</body></doi_batch>', file=f)